I'm trying to utilise list comprehension for sorting data from a very large file. The file structure is like so:
THING
info1
info2
info3
THING
info1
info2
info3
... and so on.
Basically trying to collect all info1 into a list and all info2 into another list. I have a previous script which does this, but it's slow. I'm also trying to make it object oriented so I can use the data more efficiently.
Old script:
info1_data = []
info2_data = []
with open(myfile) as f:
for line in f:
if re.search('THING',line):
line=next(f)
info1_data.append(line)
line=next(f)
info2_data.append(line)
New script:
def __init__(self, file):
self.file = file
def sort_info1(self):
with self.file as f:
info1_data = [next(f) for line in f if re.search('THING',line)]
return info1_data
def sort_info2(self):
with self.file as f:
info2_data = [next(f).next(f) for line in f if re.search('THING',line)]
return info2_data
The new script works for getting info1_data as a list. However, to get info2_data I can't find anything for skipping 2 lines using this method. I guessed at next(f).next(f). It runs but doesn't produce anything.
Is this possible?
Many thanks.
Following help from Moses I've this solution. The islice is very confusing though and I don't fully understand it, even after reading the python.docs. Does the iterable get the data (i.e., info1 or info2) or do the start, stop and step dictate what data is extracted?
islice(iterable, start, stop[, step])
from itertools import islice
import re
class SomeClass(object):
def __init__(self, file):
self.file = file
def search(self, word, i):
self.file.seek(0) # seek to start of file
for line in self.file:
if re.search(word, line) and i == 0:
line = next(self.file)
yield line
elif re.search(word, line) and i == 1:
line = next(self.file)
line = next(self.file)
yield line
def sort_info1(self):
return list(islice(self.search('THING',0), 0, None, 2))
def sort_info2(self):
return list(islice(self.search('THING',1), 2, None, 2))
info1 = SomeClass(open("test.dat")).sort_info1()
info2 = SomeClass(open("test.dat")).sort_info2()
You should the seek the file back to the start in order to repeat the search from the beginning of the file. Also, you could use a generator function to decouple the search operation from the production of the data. Then use itertools.islice to step over lines:
from itertools import islice
class SomeClass(object):
def __init__(self, file):
self.file = file
def search(self, word):
self.file.seek(0) # seek to start of file
for line in self.file:
if re.search(word, line):
# yield next two lines
yield next(self.file)
yield next(self.file)
def sort_info1(self):
return list(islice(self.search('THING'), 0, None, 2))
def sort_info2(self):
return list(islice(self.search('THING'), 1, None, 2))
However instead of passing the file, I'll suggest you pass the path to the file instead so the file can be closed after each use, to avoid holding up resources when they are not (or not yet) needed.
You can do like this:
def sort_info2(self):
with self.file as f:
info2_data = [(next(f),next(f))[1] for line in f if re.search('THING',line)]
return info2_data
But it looks a little weird way!
Related
As the title says, I made a file editing program with python.
Here is the code that I'm have a problem with:
#fileEditing.py
def fileError(file):
raise OSError("file {} does not exist".format(file))
class AccessFile():
def fileExists(self, file):
import os
return bool(os.path.exists(file))
def filecreate(self, file):
if not self.fileExists(file):
with open(file, "w") as f:
f.close()
else: raise OSError("file {} already exists".format(file))
def filedelete(self, file):
import os
if self.fileExists(file):
os.remove(file)
else: fileError(file)
def fileread(self, file):
#check if file exists
if self.fileExists(file):
#detect length of file
with open(file, "r") as f:
line = " "
x = 0
while line != "":
line = f.readline()
x += 1
#piece lines together in a list
filelines = []
with open(file, "r") as f:
for i in range(x - 1):
filelines.append(str(f.readline()))
#return a tuple
return tuple(filelines)
else: fileError(file)
def filewrite(self, file, line, text):
''' BUG: apparently this either overwrites the line its writing or appends
to the line its writing... make up your mind!'''
if self.fileExists(file):
#get file contents
filelines = list(self.fileread(file))
#see if line parameter is out of range or not
try:
filelines[line] = text
except IndexError:
for i in range(line - len(filelines)):
filelines.append("")
filelines.append(str(text) + "\n")
#apply changes
with open(file, "w") as f:
f.write("") #delete contents
with open(file, "w") as f:
for l in filelines:
f.write(l)
else: fileError(file)
def fileoverwrite(self, file, data):
#if there is no file to delete, it will make a new one
try:
self.filedelete(file)
except:
pass
self.filecreate(file)
x = 0
for line in data:
print(line)
self.filewrite(file, x, line)
x += 1
accessfile = AccessFile()
The bug is in the filewrite(self, file, line, text) function. When called, it either writes a new line (which is what I want it to do), appends to the line its supposed to replace, or just doesn't write any lines at all.
Say I want to write a python file with this program:
#pytesting.py
from fileEditing import *
file = "/Users/ashton/Desktop/Atom/Python/FileEditing/FileManager.py"
data = [
"from fileEditing import *",
"",
"class FileEditing():",
" def __init__(options, immutable_files):",
" self.options, self.immutable_files = options, immutable_files",
" ",
" def prompt():",
" ",
"",
"while True:",
" pass"
]
accessfile.fileoverwrite(file, data)
When I run it, it makes a file with accessfile.fileoverwrite(file, data), like its supposed to.
But thats where things get whacky.
(FileManager.py below)
from fileEditing import *
class FileEditing():
def __init__(options, immutable_files): self.options, self.immutable_files = options, immutable_files
def prompt():
while True:
If you know how to fix the filewrite(self, file, line, text), please let me know.
(I use python 2.7 but python 3 is fine)
So this is definitely a Python 3.x solution but you said that it is fine, don't know if it will work in Python 2.x but it is so simple it should:
def file_overwrite(self, file, data):
with open(file, 'w') as file:
file.write('\n'.join(data))
And you seemingly also need to fix that data list because it is missing a few commas. Also the fact that this is all in a class is a bit weird, you do nothing with the instance, they all might as well be separate functions or #classmethods or #staticmethods. Also several things could be improved with your other functions. For example you shouldn't open the file twice and count its lines to read it. Just do file.readlines() at it will return a list of all lines:
def fileread(self, file):
if self.fileExists(file):
with open(file) as file:
return file.readlines()
else:
fileError(file)
Then also import os once at the start of the file, you don't need to import it in every function where you use os, also:
with open(file, "w") as f:
f.close()
f.close() is completely pointless because the context manger closes the file anyways and also there is mode "x" which is specifically made for file creation and will raise an error if the file already exists: https://www.w3schools.com/python/python_file_handling.asp
My intention was to copy a piece of string after either a colon or equal sign from File 1 , and pasting that string in File 2 in a similar location after either a colon or equal sign.
For instance, if File 1 has:
username: Stack
File 2 is originally empty:
username=
I want Stack to be copied over to File 2 after username. Currently, I'm stuck and not sure what to do. The program piece I made below doesn't copy the username. I would greatly appreciate any input!
with open("C:/Users/SO//Downloads//f1.txt", "r") as f1:
with open("C:/Users/SO//Downloads//f2.txt", "r+") as f2:
searchlines = f1.readlines()
searchlines_f2=f2.readlines()
for i, line in enumerate(searchlines):
if 'username' in line:
for l in searchlines[i:i+1]:
ind = max(l.find(':'), l.find('='), 0) #finding index of specific characters
copy_string=l[ind+1:].strip() #copying string for file 2
for l in searchlines_f2[i:i+1]:
if 'username' in line:
f2.write(copy_string)
I think something like this will get you what you need in a more maintainable and Pythonic way.
Note the use of regex as well as some string methods (e.g., startswith)
import re
SOURCE_PATH = "C:/Users/SO//Downloads//f1.txt"
TARGET_PATH = "C:/Users/SO//Downloads//f2.txt"
def _get_lines(filepath):
""" read `filepath` and return a list of strings """
with open(filepath, "r+") as fh:
return fh.readlines()
def _get_value(fieldname, text):
""" parse `text` to get the value of `fieldname` """
try:
pattern = '%s[:=]{1}\s?(.*)' % fieldname
return re.match(pattern, text).group(1)
except IndexError:
# you may want to handle this differently!
return None
def _write_target(filepath, trgt_lines):
""" write `trgt_lines` to `filepath` """
with open(filepath, "w+") as fh:
fh.writelines(trgt_lines)
src_lines = _get_lines(SOURCE_PATH)
trgt_lines = _get_lines(TARGET_PATH)
# extract field values from source file
fields = ['username', 'id', 'location']
for field in fields:
value = None
for cur_src in src_lines:
if cur_src.startswith(field):
value = _get_value(field, cur_src)
break
# update target_file w/ value (if we were able to find it)
if value is not None:
for i, cur_trgt in enumerate(trgt_lines):
if cur_trgt.startswith('{0}='.format(field)):
trgt_lines[i] = '{0}={1}'.format(field, value)
break
_write_target(TARGET_PATH, trgt_lines)
Is it possible to use python to skip blocks of text when writing a file from another file?
For example lets say the input file is:
This is the file I would like to write this line
I would like to skip this line
and this one...
and this one...
and this one...
but I want to write this one
and this one...
How can I write a script that allows me to skip certain lines that differ in content and size which resumes writing the lines to another file once it recognizes a certain line?
My code reads through the lines, doesn't write duplicate lines and performs some operation on the line by using dictionaries and regex.
def is_wanted(line):
#
# You have to define this!
#
# return True to keep the line, or False to discard it
def copy_some_lines(infname, outfname, wanted_fn=is_wanted):
with open(infname) as inf, open(outfname, "w") as outf:
outf.writelines(line for line in inf if wanted_fn(line))
copy_some_lines("file_a.txt", "some_of_a.txt")
In order to extend this to multi-line blocks, you can implement a finite state machine like
which would turn into something like
class BlockState:
GOOD_BLOCK = True
BAD_BLOCK = False
def __init__(self):
self.state = self.GOOD_BLOCK
def is_bad(self, line):
# *** Implement this! ***
# return True if line is bad
def is_good(self, line):
# *** Implement this! ***
# return True if line is good
def __call__(self, line):
if self.state == self.GOOD_BLOCK:
if self.is_bad(line):
self.state = self.BAD_BLOCK
else:
if self.is_good(line):
self.state = self.GOOD_BLOCK
return self.state
then
copy_some_lines("file_a.txt", "some_of_a.txt", BlockState())
Pseudo-code:
# Open input and output files, and declare the unwanted function
for line in file1:
if unwanted(line):
continue
file2.write(line)
# Close files etc...
You can read the file line by line, and have control on each line you read:
with open(<your_file>, 'r') as lines:
for line in lines:
# skip this line
# but not this one
Note that if you want to read all lines despite the content and only then manipulate it, you can:
with open(<your_file>) as fil:
lines = fil.readlines()
This should work:
SIZE_TO_SKIP = ?
CONTENT_TO_SKIP = "skip it"
with open("my/input/file") as input_file:
with open("my/output/file",'w') as output_file:
for line in input_file:
if len(line)!=SIZE_TO_SKIP and line!=CONTENT_TO_SKIP:
output_file.write(line)
I do have following code where I am its doing following thing.
Parsing Whole file and checking for patter in each line. If the pattern exists, it should return that pattern to main function and print the value.
Issue: The function is only returning 1st pattern and do not check for same pattern into multiple lines.
code:
import re
import sys
import os
def find_pattern(file):
with open(file) as fp:
for line in fp:
if "abc" in line:
return line
else:
continue
def check(file):
return_list = []
data=find_pattern(file)
if data != None:
return_list.append(data)
if not data:
return "working"
else:
return return_list
if __name__== '__main__':
file = sys.argv[1]
print check(file)
If the file has multiple line containing abc, it will print only 1st line and skip other lines. I want to print all lines that contains abc.
Sample file
sdfdsffdabcafsfse
asasssadfsdadsadsaddsadadabc
asfasfdsafdfabc
output with above code:
sdfdsffdabcafsfse
You are prematurely returning from the function on this line:
return line
Which means you exit the function and the loop ceases to iterate assoon as the first instance is found.
Consider something like this instead, where you capture and return all matches:
def find_pattern(file):
out = []
with open(file) as fp:
for line in fp:
if "abc" in line:
out.append(line)
else:
continue
return out
Alternatively, you can manage this in a single, simple list comprehension:
def find_pattern(file):
with open(file) as fp:
return [line for line in fp if "abc" in line]
I wrote a class to deal with large files and I want to make a "write" method for the class so that I can easily make changes to the data in the file and then write out a new file.
What I want to be able do is:
1.) Read in the original file
sources = Catalog(<filename>)
2.) Make changes on the data contained in the file
for source in sources:
source['blah1'] = source['blah1'] + 4
3.) Write out the updated value to a new file
sources.catalog_write(<new_filename>)
To this end I wrote a fairly straightforward generator,
class Catalog(object):
def __init__(self, fname):
self.data = open(fname, 'r')
self.header = ['blah1', 'blah2', 'blah3']
def next(self):
line = self.data.readline()
line = line.lstrip()
if line == "":
self.data.close()
raise StopIteration()
cols = line.split()
if len(cols) != len(self.header):
print "Input catalog is not valid."
raise StopIteration()
for element, col in zip(self.header, cols):
self.__dict__.update({element:float(col)})
return self.__dict__.copy()
def __iter__(self):
return self
This is my attempt at a write method:
def catalog_write(self, outname):
with open(outname, "w") as out:
out.write(" ".join(self.header) + "\n")
for source in self:
out.write(" ".join(map(str, source)) + "\n")
But I get the following error when I try to call that class method,
File "/Catalogs.py", line 53, in catalog_write
for source in self:
File "/Catalogs.py", line 27, in next
line = self.data.readline()
ValueError: I/O operation on closed file
I realize that this is because generators are generally a one time deal but I know that there are workarounds to this (like this question and this post but I'm not sure what the best way to do this is. These files are quite large and I'd like their read in and use to be as efficient as possible (both time-wise and memory-wise). Is there a pythonic way to do this?
Assumptions made:
Input File: [ infile ]
1.2 3.4 5.6
4.5 6.7 8.9
Usage:
>>> a = Catalog('infile')
>>> a.catalog_write('outfile')
Now Output File: [ outfile ]
blah1 blah2 blah3
1.2 3.4 5.6
4.5 6.7 8.9
Writing it again to another file: [ outfile2 ]
>>> a.catalog_write('outfile2')
Now Output File: [ outfile2 ]
blah1 blah2 blah3
1.2 3.4 5.6
4.5 6.7 8.9
So from what you have posted, looks like you need to reopen your data [ Assuming it is the file object with file name as self.fname ].
Modify your __init__ to save the fname as an attribute
Create a data object initially [ I am not opening it at __init__ stage, so that you could open and close when needed all inside your next() method ] I have just created the data as an object so that it can have an attribute closed like a file object, so that you could check whether self.data.closed is True and reopen the same from inside your next() method and read from the same.
def __init__(self, fname):
self.fname = fname
self.data = object()
self.data = lambda: None
self.data.closed = True
self.header = ['blah1', 'blah2', 'blah3']
Now the next method is modified as follows :
def next(self):
if self.data.closed:
self.data = open(self.fname, "r")
line = self.data.readline()
line = line.lstrip()
if line == "":
if not self.data.closed:
self.data.close()
raise StopIteration()
cols = line.split()
if len(cols) != len(self.header):
print "Input catalog is not valid."
if not self.data.closed:
self.data.close()
raise StopIteration()
for element, col in zip(self.header, cols):
self.__dict__.update({element:float(col)})
return self.__dict__.copy()
Your catalog_write method should be as follows :
Note that any modifications to data must be done within the for loop as shown.
def catalog_write(self, outname):
with open(outname, "w") as out:
out.write(" ".join(self.header) + "\n")
for source in self:
source['blah1'] = 444 # Data modified.
out.write(" ".join(map(str, [source[self.header[i]] for i in range(len(self.header)) ])) + "\n")
I assumed that you want the updated values of the headers written as a column in the outname file.