How to split text file by id in python - python

I have a bunch of text files containing tab separated tables. The second column contains an id number, and each file is already sorted by that id number. I want to separate each file into multiple files by the id number in column 2. Here's what I have.
readpath = 'path-to-read-file'
writepath = 'path-to-write-file'
for filename in os.listdir(readpath):
with open(readpath+filename, 'r') as fh:
lines = fh.readlines()
lastid = 0
f = open(writepath+'checkme.txt', 'w')
f.write(filename)
for line in lines:
thisid = line.split("\t")[1]
if int(thisid) <> lastid:
f.close()
f = open(writepath+thisid+'-'+filename,'w')
lastid = int(thisid)
f.write(line)
f.close()
What I get is simply a copy of all the read files with the first id number from each file in front of the new filenames. It is as if
thisid = line.split("\t")[1]
is only done once in the loop. Any clue to what is going on?
EDIT
The problem was my files used \r rather than \r\n to terminate lines. Corrected code (simply adding 'rU' when opening the read file and swapping != for <>):
readpath = 'path-to-read-file'
writepath = 'path-to-write-file'
for filename in os.listdir(readpath):
with open(readpath+filename, 'rU') as fh:
lines = fh.readlines()
lastid = 0
f = open(writepath+'checkme.txt', 'w')
f.write(filename)
for line in lines:
thisid = line.split("\t")[1]
if int(thisid) != lastid:
f.close()
f = open(writepath+thisid+'-'+filename,'w')
lastid = int(thisid)
f.write(line)
f.close()

If you're dealing with tab delimited files, then you can use the csv module, and take advantage of the fact that itertools.groupby will do the previous/current tracking of the id for you. Also utilise os.path.join to make sure your filenames end up joining correctly.
Untested:
import os
import csv
from itertools import groupby
readpath = 'path-to-read-file'
writepath = 'path-to-write-file'
for filename in os.listdir(readpath):
with open(os.path.join(readpath, filename)) as fin:
tabin = csv.reader(fin, delimiter='\t')
for file_id, rows in groupby(tabin, lambda L: L[1]):
with open(os.path.join(writepath, file_id + '-' + filename), 'w') as fout:
tabout = csv.writer(fout, delimiter='\t')
tabout.writerows(rows)

Related

Check if timestamp is between limits

I m looking for a solution to make inside a reading log file processus a compare to check the time range within of the current line
currently I have :
variables : datetime_ed & datetime_sd as enddate & startdate with each fd_fmt as '%Y-%m-%d-%H-%M'
the current existing code block looks like
with open(f, "r") as fi:#, open(output_file, "w") as fo:
for line in fi:
#for pattern in REGEXES:
if REGEXES[0].search(line): # search by ID instead of the wordslist at first
match = REGEXES[1].search(line)
if match:
line_id = match.group(1)
# doing stuff here
so I would do something like
with open(f, "r") as fi:#, open(output_file, "w") as fo:
for line in fi:
#for pattern in REGEXES:
if REGEXES[0].search(line): # search by ID instead of the wordslist at first
match = REGEXES[1].search(line)
#ONLY IF LINE CONTAIN DATE BETWEEN datetime_ed & datetime_sd
if match:
line_id = match.group(1)
# doing stuff here
how to do that ?
didn't found a function to do so in datetime documents
as temporary solution I made that :
with open(f, "r") as fi:#, open(output_file, "w") as fo:
for line in fi:
try:
ts = datetime.datetime.strptime(line[:16], fd_fmt) # try/except to avoid issues with non-dated lines
except ValueError:
pass
#print("ts ",ts)
if datetime_sd < ts < datetime_ed :
#for pattern in REGEXES:
if REGEXES[0].search(line): # search by ID instead of the wordslist at first
match = REGEXES[1].search(line)
if match:
#etc...
Is this good / correct ?

How to replace two different lines of text?

I need to create a file that changes the date and name of a .txt, but I can only change one or the other with this code I found on the internet, can anyone give me any tips?
Print
import os
from ast import Str
file = open("example.txt", "r")
replacement = ""
data = "02/07/2022"
name = "Alan"
for line in file:
line = line.strip()
changes = line.replace("__/__/____", data)
replacement = replacement + changes + "\n"
file.close()
fout = open("final.txt", "w")
fout.write(replacement)
fout.close()
You don't need to do this a line a time. You can replace that entire program with this:
data = "02/07/2022"
name = "Alan"
text = open("example.txt", "r").read().replace("__/__/____", data)
open("final.txt", "w").write(text)

Compare multiple text files, and save commons values

My actual code :
import os, os.path
DIR_DAT = "dat"
DIR_OUTPUT = "output"
filenames = []
#in case if output folder doesn't exist
if not os.path.exists(DIR_OUTPUT):
os.makedirs(DIR_OUTPUT)
#isolating empty values from differents contracts
for roots, dir, files in os.walk(DIR_DAT):
for filename in files:
filenames.append("output/" + os.path.splitext(filename)[0] + ".txt")
filename_input = DIR_DAT + "/" + filename
filename_output = DIR_OUTPUT + "/" + os.path.splitext(filename)[0] + ".txt"
with open(filename_input) as infile, open(filename_output, "w") as outfile:
for line in infile:
if not line.strip().split("=")[-1]:
outfile.write(line)
#creating a single file from all contracts, nb the values are those that are actually empty
with open(DIR_OUTPUT + "/all_agreements.txt", "w") as outfile:
for fname in filenames:
with open(fname) as infile:
for line in infile:
outfile.write(line)
#finale file with commons empty data
#creating a single file
with open(DIR_OUTPUT + "/all_agreements.txt") as infile, open(DIR_OUTPUT + "/results.txt", "w") as outfile:
seen = set()
for line in infile:
line_lower = line.lower()
if line_lower in seen:
outfile.write(line)
else:
seen.add(line_lower)
print("Psst go check in the ouptut folder ;)")
The last lines of my code are checking wether or not, element exists mutliple times. So, may the element exists, once, twice, three, four times. It will add it to results.txt.
But the thing is that I want to save it into results.txt only if it exists 4 times in results.txt.
Or best scenario, compare the 4 .txt files and save elements in commons into results.txt.
But I can't solve it..
Thanks for the help :)
To make it easier,
with open(DIR_OUTPUT + "/all_agreements.txt") as infile, open(DIR_OUTPUT + "/results.txt", "w") as outfile:
seen = set()
for line in infile:
if line in seen:
outfile.write(line)
else:
seen.add(line)
Where can I use the .count() function ?
Because I want to do something like xxx.count(line) == 4 then save it into resulsts.txt
If your files are not super big you can use set.intersection(a,b,c,d).
data = []
for fname in filenames:
current = set()
with open(fname) as infile:
for line in infile:
current.add(line)
data.append(current)
results = set.intersection(*data)
You also don't need to create one single big file for this issue.
Not sure how your input looks like or what output is expected...
But maybe this can spark some ideas:
from io import StringIO
from collections import Counter
lines = ["""\
a=This
b=is
c=a Test
""", """\
a=This
b=is
c=a Demonstration
""", """\
a=This
b=is
c=another
d=example
""", """\
a=This
b=is
c=so much
d=fun
"""]
files = (StringIO(l) for l in lines)
C = Counter(line for f in files for line in f)
print([k for k,v in C.items() if v >= 4])
# Output: ['a=This\n', 'b=is\n']

Deleting one line within txt file in Python

I am having problems deleting a specific line/entry within a text file. With the code I have the top line in the file is deleted no matter what line number I select to delete.
def erase():
contents = {}
f = open('members.txt', 'a')
f.close()
f = open('members.txt', 'r')
index = 0
for line in f:
index = index + 1
contents[index] = line
print ("{0:3d}) {1}".format(index,line))
f.close()
total = index
entry = input("Enter number to be deleted")
f = open('members.txt', 'w')
index = 0
for index in range(1,total):
index = index + 1
if index != entry:
f.write(contents[index])
Try this:
import sys
import os
def erase(file):
assert os.path.isfile(file)
with open(file, 'r') as f:
content = f.read().split("\n")
#print content
entry = input("Enter number to be deleted:")
assert entry >= 0 and entry < len(content)
new_file = content[:entry] + content[entry+1:]
#print new_file
with open(file,'w') as f:
f.write("\n".join(new_file))
if __name__ == '__main__':
erase(sys.argv[1])
As already noted you were starting the range from 1 which is incorrect. List slicing which I used in new_file = content[:entry] + content[entry+1:] makes the code more readable and it is an approach less prone to similar errors.
Also you seem to open and close the input file at the beginning for no reason. Also you should use with if possible when doing operations with files.
Finally I used the join and split to simplify the code so you don't need a for loop to process the lines of the file.

Python: Issue when trying to read and write multiple files

This script reads and writes all the individual html files in a directory. The script reiterates, highlight and write the output.The issue is, after highlighting the last instance of the search item, the script removes all the remaining contents after the last search instance in the output of each file. Any help here is appreciated.
import os
import sys
import re
source = raw_input("Enter the source files path:")
listfiles = os.listdir(source)
for f in listfiles:
filepath = os.path.join(source+'\\'+f)
infile = open(filepath, 'r+')
source_content = infile.read()
color = ('red')
regex = re.compile(r"(\b in \b)|(\b be \b)|(\b by \b)|(\b user \b)|(\bmay\b)|(\bmight\b)|(\bwill\b)|(\b's\b)|(\bdon't\b)|(\bdoesn't\b)|(\bwon't\b)|(\bsupport\b)|(\bcan't\b)|(\bkill\b)|(\betc\b)|(\b NA \b)|(\bfollow\b)|(\bhang\b)|(\bbelow\b)", re.I)
i = 0; output = ""
for m in regex.finditer(source_content):
output += "".join([source_content[i:m.start()],
"<strong><span style='color:%s'>" % color[0:],
source_content[m.start():m.end()],
"</span></strong>"])
i = m.end()
outfile = open(filepath, 'w')
outfile.seek(0, 2)
outfile.write(output)
print "\nProcess Completed!\n"
infile.close()
outfile.close()
raw_input()
After your for loop is over, you need to include whatever is left after the last match:
...
i = m.end()
output += source_content[i:]) # Here's the end of your file
outfile = open(filepath, 'w')
...

Categories