Is there a better way of doing this? - Python

Is there another way of checking if something is first?
I've been using for i, f in enumerate(read_files), where I enumerate a list of files, and an if statement to check whether i == 0. I'm curious if there is a different (better, faster, less typing) way to do this?
read_files = glob.glob("post_stats_*.tsv")
with open("result.tsv", "w") as outfile:
    for i, f in enumerate(read_files):
        with open(f, "r") as infile:
            metric_name = f.strip(".tsv").split("_")[2]
            if i == 0:
                outfile.write(metric_name.upper() + "\n" + infile.read())
            else:
                outfile.write("\n" + metric_name.upper() + "\n" + infile.read())

Since it seems the only use of the if is to avoid a blank line at the start of the output file, how about putting the blank line after the file's contents? That will lead to a blank line at the end of the file where it's unlikely to hurt:
read_files = glob.glob("post_stats_*.tsv")
with open("result.tsv", "w") as outfile:
    for f in read_files:
        with open(f, "r") as infile:
            metric_name = f.strip(".tsv").split("_")[2]
            outfile.write(metric_name.upper() + "\n" + infile.read() + "\n")
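If the trailing blank line bothers you too, a third option is to collect each section first and join them with the separator, so no index check is needed at all. A minimal sketch, assuming the same file layout as above:
import glob

# Build each METRIC-header-plus-contents section, then join with blank lines.
read_files = glob.glob("post_stats_*.tsv")
sections = []
for f in read_files:
    with open(f, "r") as infile:
        metric_name = f.strip(".tsv").split("_")[2]
        sections.append(metric_name.upper() + "\n" + infile.read())

with open("result.tsv", "w") as outfile:
    outfile.write("\n".join(sections))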


Compare multiple text files, and save commons values

My actual code:
import os, os.path

DIR_DAT = "dat"
DIR_OUTPUT = "output"
filenames = []

# create the output folder in case it doesn't exist
if not os.path.exists(DIR_OUTPUT):
    os.makedirs(DIR_OUTPUT)

# isolating empty values from the different contracts
for root, dirs, files in os.walk(DIR_DAT):
    for filename in files:
        filenames.append(DIR_OUTPUT + "/" + os.path.splitext(filename)[0] + ".txt")
        filename_input = DIR_DAT + "/" + filename
        filename_output = DIR_OUTPUT + "/" + os.path.splitext(filename)[0] + ".txt"
        with open(filename_input) as infile, open(filename_output, "w") as outfile:
            for line in infile:
                if not line.strip().split("=")[-1]:
                    outfile.write(line)

# creating a single file from all contracts; NB the values are those that are actually empty
with open(DIR_OUTPUT + "/all_agreements.txt", "w") as outfile:
    for fname in filenames:
        with open(fname) as infile:
            for line in infile:
                outfile.write(line)

# final file with the common empty data
# creating a single file
with open(DIR_OUTPUT + "/all_agreements.txt") as infile, open(DIR_OUTPUT + "/results.txt", "w") as outfile:
    seen = set()
    for line in infile:
        line_lower = line.lower()
        if line_lower in seen:
            outfile.write(line)
        else:
            seen.add(line_lower)

print("Psst, go check in the output folder ;)")
The last lines of my code check whether or not an element exists multiple times. So the element may exist once, twice, three, or four times; it will be added to results.txt.
But the thing is that I want to save it into results.txt only if it exists 4 times.
Or, best scenario, compare the 4 .txt files and save the elements they have in common into results.txt.
But I can't solve it...
Thanks for the help :)
To make it easier,
with open(DIR_OUTPUT + "/all_agreements.txt") as infile, open(DIR_OUTPUT + "/results.txt", "w") as outfile:
    seen = set()
    for line in infile:
        if line in seen:
            outfile.write(line)
        else:
            seen.add(line)
Where can I use the .count() function?
Because I want to do something like xxx.count(line) == 4 and then save it into results.txt.
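A minimal sketch of where .count() could fit, assuming all_agreements.txt is small enough to read into memory (note that list.count() rescans the whole list on every call, so the Counter approach further down scales better):
with open(DIR_OUTPUT + "/all_agreements.txt") as infile:
    all_lines = infile.readlines()

with open(DIR_OUTPUT + "/results.txt", "w") as outfile:
    for line in sorted(set(all_lines)):   # set() avoids writing duplicates
        if all_lines.count(line) == 4:    # keep only lines present in all 4 files
            outfile.write(line)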
If your files are not super big, you can use set.intersection(a, b, c, d).
data = []
for fname in filenames:
    current = set()
    with open(fname) as infile:
        for line in infile:
            current.add(line)
    data.append(current)
results = set.intersection(*data)
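results now holds the lines common to all the files; writing them out is then straightforward (a sketch, reusing DIR_OUTPUT from the question):
with open(DIR_OUTPUT + "/results.txt", "w") as outfile:
    outfile.writelines(sorted(results))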
You also don't need to create one single big file for this.
I'm not sure what your input looks like or what output is expected, but maybe this can spark some ideas:
from io import StringIO
from collections import Counter
lines = ["""\
a=This
b=is
c=a Test
""", """\
a=This
b=is
c=a Demonstration
""", """\
a=This
b=is
c=another
d=example
""", """\
a=This
b=is
c=so much
d=fun
"""]
files = (StringIO(l) for l in lines)
C = Counter(line for f in files for line in f)
print([k for k,v in C.items() if v >= 4])
# Output: ['a=This\n', 'b=is\n']
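To get from there to the requested results.txt, the same Counter can be written out directly (a sketch; DIR_OUTPUT comes from the question's code):
with open(DIR_OUTPUT + "/results.txt", "w") as outfile:
    outfile.writelines(k for k, v in C.items() if v >= 4)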

Saving a text file in a for loop

I'm trying to loop through a file, split the sentences into individual lines, and then export that data.
filename = '00000BF8_ar.txt'
with open(filename, mode="r") as outfile:
    str_output = outfile.readlines()
str_output = ''.join(str_output)
sentenceSplit = filter(None, str_output.split("."))
for s in sentenceSplit:
    print(s.strip() + ".")
    # output += s
    myfile = open(filename, 'w')
    myfile.writelines(s)
    myfile.close()
Unfortunately, it looks like the loop only goes through a few lines and saves them. So the whole file isn't looped through and saved. Any help on how I can fix that?
Here is the code; I hope this is what you want to achieve:
filename = '00000BF8_ar.txt'
with open(filename, mode="r") as outfile:
    str_output = outfile.readlines()
str_output = ''.join(str_output)
sentenceSplit = filter(None, str_output.split("."))
l = []
for s in sentenceSplit:
    l.append(s.strip() + ".")
myfile = open(filename, 'w')
myfile.write('\n'.join(l))
myfile.close()
Each time you re-open the file with the 'w' option, you basically erase its content.
Try modifying your code like this:
filename = '00000BF8_ar.txt'
with open(filename, "r") as infile:
    str_output = infile.readlines()
str_output = ''.join(str_output)
sentenceSplit = filter(None, str_output.split("."))
with open(filename, "w") as outfile:
    for s in sentenceSplit:
        print(s.strip() + ".")
        # output += s
        outfile.writelines(s)
Another way to achieve the same thing would have been to open a new file using open(filename_new, 'a'), which opens a file for appending, but as a rule of thumb try not to open/close files inside a loop.
open(filename, 'w') will overwrite the file each time it is opened. My guess is that what's currently happening is that only the last element in sentenceSplit is showing up in myfile.
The simple "solution" is to use append instead of write:
open(filename, 'a')
which will simply start writing at the end of the file, without deleting the rest of it.
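A toy sketch of the difference, using a hypothetical demo.txt:
with open('demo.txt', 'w') as f:
    f.write('first\n')    # 'w' truncates: the file now holds only this line
with open('demo.txt', 'a') as f:
    f.write('second\n')   # 'a' appends: both lines are kept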
However, as #chepner's comment states, why are you reopening the file at all? I would recommend changing your code to this:
with open(filename, mode="r") as outfile:
    str_output = outfile.readlines()
str_output = ''.join(str_output)
sentenceSplit = filter(None, str_output.split("."))
with open(filename, mode='w') as myfile:
    for s in sentenceSplit:
        print(s.strip() + ".")
        myfile.writelines(s)
This way, instead of opening it many times and overwriting it every time, you're only opening it once and writing to it continuously.

Writing several files according to an element of an original file

I need to split a file in BED format, which contains coordinates for all the chromosomes in a genome, into different files according to the chromosome (chr). I tried this approach but it doesn't work; it doesn't create any files. Any ideas why this happens, or alternative approaches to solve this problem?
import sys

def make_out_file(dir_path, chr_name, extension):
    file_name = dir_path + "/" + chr_name + extension
    out_file = open(file_name, "w")
    out_file.close()
    return file_name

def append_output_file(line, out_file):
    with open(out_file, "a") as f:
        f.write(line)
        f.close()

in_name = sys.argv[1]
dir_path = sys.argv[2]

with open(in_name, "r") as in_file:
    file_content = in_file.readlines()
    chr_dict = {}
    out_file_dict = {}
    line_count = 0
    for line in file_content[:0]:
        line_count += 1
        elems = line.split("\t")
        chr_name = elems[0]
        chr_dict[chr_name] += 1
        if chr_dict.get(chr_name) = 1:
            out_file = make_out_file(dir_path, chr_name, ".bed")
            out_file_dict[chr_name] = out_file
            append_output_file(line, out_file)
        elif chr_dict.get(chr_name) > 1:
            out_file = out_file_dict.get(chr_name)
            append_output_file(line, out_file)
        else:
            print "There's been an Error"
    in_file.close()
This line:
for line in file_content[:0]:
says to iterate over an empty list. The empty list comes from the slice [:0] which says to slice from the beginning of the list to just before the first element. Here's a demonstration:
>>> l = ['line 1\n', 'line 2\n', 'line 3\n']
>>> l[:0]
[]
>>> l[:1]
['line 1\n']
Because the list is empty no iteration takes place, so the code in the body of your for loop is not executed.
To iterate over each line of the file you do not need the slice:
for line in file_content:
However, it is better still to iterate over the file object directly, as this does not require the whole file to be read into memory first:
with open(in_name, "r") as in_file:
    chr_dict = {}
    out_file_dict = {}
    line_count = 0
    for line in in_file:
        ...
Following that there are numerous problems, including syntax errors, with the code in the for loop which you can begin debugging.
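As a starting point, a hedged sketch of what the corrected loop might look like; it fixes the empty slice and the invalid = comparison, and keeps a dict of open file handles instead of reopening a file for every line (the variable names here are my own):
import sys

in_name = sys.argv[1]
dir_path = sys.argv[2]

out_files = {}  # chr_name -> open file handle

with open(in_name, "r") as in_file:
    for line in in_file:
        chr_name = line.split("\t")[0]
        if chr_name not in out_files:  # first line for this chromosome
            out_files[chr_name] = open(dir_path + "/" + chr_name + ".bed", "w")
        out_files[chr_name].write(line)

for f in out_files.values():
    f.close()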

How can I make this program more efficient

This program is a basic encoder in Python, and I want to see if I can make it more efficient without changing the names of the defined variables. Can someone give me some suggestions?
def encode(pattern, filename):
    f = open(filename, "rt")
    contents = f.read()
    f.close()
    printNow(contents)
    changes = pattern.split("|")
    for str in changes:
        printNow("Change " + str[0] + " to " + str[1])
    newMsg = ""
    for char in contents:
        for change in changes:
            if char == change[0]:
                char = change[1]
        newMsg += char
    f = open(filename + "encoded", "wt")
    f.write(newMsg)
    f.close()
    f = open(filename + "encoded", "rt")
    printNow(f.read())
    f.close()

encode("ae|ga|s3", "C:\\Users\\Shaun\\Desktop\\Test.txt")
import string

def encode(pattern, filename):
    with open(filename) as f:
        contents = f.read()
    s = string.maketrans(*[''.join(a) for a in zip(*pattern.split('|'))])
    newMsg = contents.translate(s)
    with open(filename + 'encoded', 'w') as f:  # 'w', not 'rt': we are writing
        f.write(newMsg)
Use str.translate() instead of doing all the replacements the hard way, and do it line-by-line.
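Note that string.maketrans is Python 2 only; in Python 3 the same idea uses str.maketrans. A sketch under that assumption:
# Python 3 variant: build the translation table with str.maketrans.
pairs = "ae|ga|s3".split("|")
table = str.maketrans("".join(p[0] for p in pairs), "".join(p[1] for p in pairs))
print("gas".translate(table))  # -> "ae3"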
First of all, you need to consider the option that your algorithm is already good enough. Even if it can be optimized, if your code is part of a bigger program and it only executes during 0.1% of the time, for instance, then it will most probably be useless to optimize it, since the rest of the program will dominate the total execution time.
If you really have a problem in your code, then I would start by analyzing the complexity of your algorithm.
And finally, you could try to find some bottlenecks in your code. For that, I would profile the code with something like Python's timeit.
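For instance, a minimal timeit sketch (the pattern and file name are just placeholders):
import timeit

# Time 100 calls of encode(); assumes encode is defined in the current module.
elapsed = timeit.timeit(lambda: encode("ae|ga|s3", "test.txt"), number=100)
print("total seconds for 100 runs:", elapsed)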
The str.translate() method works well for character substitutions, but here's another fast way I've used that also works for multi-character substitutions:
import re

def encode(pattern, filename):
    f = open(filename, "rt")
    contents = f.read()
    f.close()
    printNow(contents)
    change_dict = {}
    matches = []
    changes = pattern.split("|")
    for str in changes:
        printNow("Change " + str[0] + " to " + str[1])
        change_dict[str[0]] = str[1]
        matches.append(str[0])
    change_re = re.compile("|".join(re.escape(x) for x in matches))
    newMsg = change_re.sub(lambda m: change_dict[m.group(0)], contents)
    f = open(filename + "encoded", "wt")
    f.write(newMsg)
    f.close()
    f = open(filename + "encoded", "rt")
    printNow(f.read())
    f.close()

encode("ae|ga|s3", "C:\\Users\\Shaun\\Desktop\\Test.txt")

How to add a copy of each line in a txt file (containing two parts separated by a particular symbol) with the order of the two parts inverted?

I have a txt file with a number of lines (x#y). Each line has two parts (x, y) separated by a particular symbol (#). How would a Python script read each line of the txt file and add a new line under each existing line, with the order of the two parts inverted (y#x)?
What I'm trying to do presented as input/output:
INPUT:
x1#y1
x2#y2
x3#y3
OUTPUT:
x1#y1
y1#x1
x2#y2
y2#x2
x3#y3
y3#x3
How can this be done with Python?
Here's one way:
infilename = 'in.dat'
outfilename = 'out.dat'
sep = '#'
with open(infilename) as infile, open(outfilename, 'w') as outfile:
    for line in infile:
        split = line.strip().partition(sep)
        outfile.write(line)
        outfile.write(''.join(reversed(split)) + '\n')
and then
~/coding$ cat in.dat
x1#y1
x2#y2
x3#y3
~/coding$ python inverter.py
~/coding$ cat out.dat
x1#y1
y1#x1
x2#y2
y2#x2
x3#y3
y3#x3
This assumes the name of your file is bar.txt and that you want to write it back to bar.txt. It also does no error checking and does not care about memory usage.
if __name__ == "__main__":
    myfile = open("bar.txt", "rb")
    lines = myfile.readlines()
    myfile.close()
    myfile = open("bar.txt", "wb")
    for l in lines:
        ls = l.strip()
        myfile.write(ls + "\n")
        lsplit = ls.split("#")
        myfile.write(lsplit[1] + "#" + lsplit[0] + "\n")
    myfile.close()
There are cleaner ways to do this, but you could use something like:
f = open('my_file.txt', 'r')
lines = f.readlines()
f.close()

outfile = open('my_file2.txt', 'w')
# write each line, followed by the flipped line
for line in lines:
    line = line.rstrip('\n')           # the line already ends in '\n'
    outfile.write('%s\n' % line)
    parts = line.split('#')
    outfile.write('%s#%s\n' % (parts[1], parts[0]))  # %-formatting takes a tuple
outfile.close()
You can use the open and read functions to read your file and then use this function,
>>> import re
>>> st = "x1#y1"
>>> def myfunc(string):
...     mylist = re.split(r'(#)', string)
...     mylist.reverse()
...     print "".join(mylist), string
...
>>> myfunc(st)
y1#x1 x1#y1
and then use write to write the strings into your new file.
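A sketch gluing those pieces together (the file names here are placeholders):
import re

with open("in.txt") as infile, open("out.txt", "w") as outfile:
    for line in infile:
        line = line.strip()
        mylist = re.split(r'(#)', line)  # the capture group keeps the '#'
        mylist.reverse()
        outfile.write(line + "\n" + "".join(mylist) + "\n")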
def swap(delimiter="#", input="input.txt", output="output.txt"):
    with open(input, "r") as input_file, open(output, "w") as output_file:
        for line in input_file:
            line = line.strip()
            output_line = delimiter.join(reversed(line.split(delimiter)))
            output_file.write(line + "\n")
            output_file.write(output_line + "\n")

swap()
Riffing on @DSM:
with open(infilename) as infile, open(outfilename, 'w') as outfile:
    lines = [line.rstrip() for line in infile]
    outfile.write("\n".join("%s\n%s%s%s" % (line, y, sep, x)
                            for line in lines
                            for x, y in [line.split(sep)]) + "\n")
lines could also be a generator expression instead of a list comprehension:
lines = (line.rstrip() for line in infile)
Later: I did not realize until now that OP wanted the original line followed by the reversed line. Adjusted accordingly.
