How to read the n last lines of a file? - python

I have to read the last 4 lines of a file.
I tried the following:
# Open the tcl compile script for reading ('r+' also allows writing, unused here).
top_tb_comp_file = open('../../ver/sim/top_tb_compile.tcl', 'r+')
# BUG: [:-4] keeps everything EXCEPT the last four lines -- the exact opposite
# of the goal. The wanted slice is [-4:] (the last four elements).
top_tb_comp_end = top_tb_comp_file.readlines()[:-4]
top_tb_comp_file.close()
Didn't work (I get the first line of the file in top_tb_comp_end).

The following example opens a file named names.txt and prints the last 4 lines in the file. Applied to your example, you only need to take away the pattern given on lines 2, 5, and 7. The rest is simple.
#! /usr/bin/env python3
import collections
def main():
    """Print the last four lines of names.txt.

    A deque with maxlen=4 keeps only the newest four lines while the
    file is streamed, so the whole file is never held in memory.
    """
    with open('names.txt') as fh:
        tail = collections.deque(fh, maxlen=4)
    # the lines keep their trailing newlines, so join with an empty sep
    print(*tail, sep='')

if __name__ == '__main__':
    main()

Your indexing is wrong. With the [:-4], you are asking for the exact opposite of what you actually want.
Try the following:
top_tb_comp_file = open('../../ver/sim/top_tb_compile.tcl', 'r+')
# [-4:] selects the LAST four elements of the list returned by readlines().
top_tb_comp_end = top_tb_comp_file.readlines()[-4:]
# you noticed that the '-4' is now before the ':'
top_tb_comp_file.close()
EDIT
Thanks to @Noctis, I have done some benchmarking around the question, comparing the speed and memory usage of the collections.deque option and the file.readlines one.
The collections option suggested by @Noctis seems to be better in terms of memory usage AND speed: in my results I observed a small peak in memory usage at the critical line file.readlines()[-4:] which did not happen at the line collections.deque(file, 4). Moreover, I repeated the speed test including the file-reading phase, and the collections option also seems faster in this case.
I have experienced some issues displaying the output of this code with the SO rendering, but if you install the packages memory_profiler and psutil you should be able to see for yourself (with a large-sized file).
import sys
import collections
import time
from memory_profiler import profile
#profile  (note: originally the '@profile' decorator from memory_profiler; the '@' was lost in the paste)
def coll_func(filename):
    """Stream filename, retaining only its last four lines in a bounded deque."""
    with open(filename) as stream:
        tail = collections.deque(stream, maxlen=4)
    return 0
#profile  (note: originally the '@profile' decorator from memory_profiler; the '@' was lost in the paste)
def indexing_func(filename):
    """Read filename fully via readlines(), then slice off the last four lines."""
    with open(filename) as stream:
        tail = stream.readlines()[-4:]
    return 0
#profile  (note: originally the '@profile' decorator from memory_profiler; the '@' was lost in the paste)
def witness_func(filename):
    """Control case for the benchmark: open and close filename without reading."""
    with open(filename) as stream:
        pass
    return 0
def square_star(s_toprint, ext="-"):
    """Return s_toprint framed in a three-line ASCII box.

    NOTE(review): the outer `ext` parameter is accepted but never used;
    it is shadowed by the inner surround() default. Kept for compatibility.
    """
    def surround(s, ext="+"):
        # wrap a string in a single delimiter character on both sides
        return ext + s + ext

    hbar = "-" * (len(s_toprint) + 1)
    top = surround(hbar)
    middle = surround(s_toprint, ext='|')
    return top + "\n" + middle + "\n" + surround(hbar)
if __name__ == '__main__':
    # CLI: profile.py <filename> <func-number>, where <func-number> selects
    # which benchmark variant to time (see d_func below).
    s_fname = sys.argv[1]
    s_func = sys.argv[2]
    # dispatch table: map the CLI digit to the benchmark function
    d_func = {
        "1": coll_func,
        "2": indexing_func,
        "3": witness_func
    }
    func = d_func[s_func]
    # wall-clock timing of a single run of the selected function
    start = time.time()
    func(s_fname)
    elapsed_time = time.time() - start
    s_toprint = square_star("Elapsed time:\t{}".format(elapsed_time))
    print(s_toprint)
Just type the following:
python3 -m memory_profiler profile.py "my_file.txt" n
n being 1, 2 or 3.

Related

Is the for loop in my code the speed bottleneck?

The following code looks through 2500 markdown files with a total of 76475 lines, to check each one for the presence of two strings.
#!/usr/bin/env python3
# encoding: utf-8
import re
import os
zettelkasten = '/Users/will/Dropbox/zettelkasten'
def zsearch(s, *args):
    """Return s when every term in args occurs in s preceded by a space
    (case-insensitively); otherwise return None.

    NOTE(review): each term is interpolated into the pattern unescaped,
    so regex metacharacters in a search term would alter the match.
    """
    for term in args:
        # builds a lookahead such as '(?=.* COVID)'
        if re.search(r"(?=.* " + term + ")", s, re.IGNORECASE) is None:
            return None
    return s
# Scan every markdown note in the zettelkasten directory and print a
# wiki-link for each line that matches all of the search terms.
for filename in os.listdir(zettelkasten):
    if filename.endswith('.md'):
        with open(os.path.join(zettelkasten, filename),"r") as fp:
            for line in fp:
                result_line = zsearch(line, "COVID", "vaccine")
                if result_line != None:
                    # filename[-15:-3] strips '.md' and keeps the 12-digit
                    # timestamp UUID -- assumes names like '...202202121717.md'
                    UUID = filename[-15:-3]
                    # NOTE(review): emits one link per matching LINE, so a
                    # file with several hits is printed more than once
                    print(f'›[[{UUID}]] OR', end=" ")
This correctly gives output like:
›[[202202121717]] OR ›[[202003311814]] OR
, but it takes almost two seconds to run on my machine, which I think is much too slow. What, if anything, can be done to make it faster?
The main bottleneck is the regular expressions you're building.
If we print(f"{r=}") inside the zsearch function:
>>> zsearch("line line covid line", "COVID", "vaccine")
r='(?=.* COVID)'
r='(?=.* vaccine)'
The (?=.*) lookahead is what is causing the slowdown - and it's also not needed.
You can achieve the same result by searching for:
r=' COVID'
r=' vaccine'

Python text file manipulation, add delta time to each line in seconds

I am a beginner at python and trying to solve the below:
I have a text file that each line starts like this:
<18:12:53.972>
<18:12:53.975>
<18:12:53.975>
<18:12:53.975>
<18:12:54.008>
etc
Instead of above I would like to add the elapsed time in seconds in the beginning of each line, but only if the line starts with '<'.
<0.0><18:12:53.972>
<0.003><18:12:53.975>
<0.003><18:12:53.975>
<0.003><18:12:53.975>
<0.036><18:12:54.008>
etc
Here comes a try :-)
#import datetime
from datetime import timedelta
from sys import argv

# CLI: script.py <input-logfile> <output-logfile>
run, input, output = argv

# --- First pass: compute the elapsed-seconds delta for every '<' line. ---
# (The original pre-counted lines with an extra unclosed open(); streaming
# the file once with a for loop makes that unnecessary.)
delta_to_list = []
start_value = None
with open(input) as f:
    for line in f:
        if not line.startswith('<'):
            continue
        # timestamp field looks like '<HH:MM:SS.mmm>'
        get_time = line[1:13]
        h = int(get_time[0:2])
        m = int(get_time[3:5])
        s = int(get_time[6:8])
        ms = int(get_time[9:13])
        time = timedelta(hours=h, minutes=m, seconds=s, microseconds=0, milliseconds=ms)
        sec_time = time.seconds + (ms / 1000)
        if start_value is None:
            # first timestamp seen is the reference point (delta 0.0)
            start_value = sec_time
        delta_to_list.append(float("{0:.3f}".format(sec_time - start_value)))

# --- Second pass: rewrite the log, prefixing '<' lines with their delta. ---
k = 0
with open(output, 'w') as out_file:
    with open(input, 'r') as in_file:
        for line in in_file:
            if line.startswith('<'):
                s = str(delta_to_list[k])
                out_file.write("<" + s + ">" + line)
                # BUG FIX: advance k only when a delta was consumed. The
                # original incremented k for EVERY line, so lines without a
                # leading '<' pushed the index past the end of the list
                # (the reported "IndexError: list index out of range").
                k += 1
            else:
                out_file.write(line)
As it is now, it works fine, but the last two lines are not written to the new file. It says: "s = str(delta_to_list[k]) IndexError: list index out of range".
At first I would like to get my code working, and second a suggestions for improvements. Thank you!
First point: never read a full file in memory when you don't have too (and specially when you don't know whether you have enough free memory).
Second point: learn to use python's for loop and iteration protocol. The way to iterate over a list and any other iterable is:
for item in some_iterable:
do_something_with(item)
This avoids messing with indexes and getting it wrong ;)
One of the nice things with Python file objects is that they actually are iterables, so to iterate over a file lines, the simplest way is:
for line in my_opened_file:
do_something_with(line)
Here's a simple yet working and mostly pythonic (nb: python 2.7.x) way to write your program:
# -*- coding: utf-8 -*-
import os
import sys
import datetime
import re
import tempfile
def totime(timestr):
    """Convert an 'HH:MM:SS' string to a datetime on the bogus date
    1900-01-01 -- we only need datetime objects so they can be subtracted.
    """
    hours, minutes, seconds = (int(part) for part in timestr.split(":"))
    return datetime.datetime(1900, 1, 1, hours, minutes, seconds)
def process(instream, outstream):
    """Copy timestamped lines from instream to outstream, prefixing each
    with the whole seconds elapsed since the first timestamp seen.

    Lines that do not start with '<HH:MM:SS.' are silently skipped.
    """
    # one anchored pattern, compiled once per call
    time_re = re.compile("^<(?P<time>\d{2}:\d{2}:\d{2})\.")
    first = None
    for line in instream:
        # cheap pre-filter before paying for the regex
        if not line.startswith("<"):
            continue
        match = time_re.search(line)
        if not match:
            # starts with '<' but carries no 'HH:MM:SS.' stamp -- ignore it
            continue
        # bogus fixed date: datetime objects are only needed for subtraction
        current = datetime.datetime(1900, 1, 1, *map(int, match.group("time").split(":")))
        if first is None:
            first = current
        # (current - first) is a timedelta; .seconds gives whole seconds
        elapsed = (current - first).seconds
        outstream.write("{}{}".format(elapsed, line))
def usage(err=None):
    """Print err (when given) plus the usage line to stderr and return a
    shell exit code (2 on error, 0 otherwise).

    NOTE: uses Python 2's `print >> stream` syntax -- this whole script
    is Python 2.7 code, as the surrounding answer states.
    """
    if err:
        print >> sys.stderr, err
    print >> sys.stderr, "usage: python retime.py <filename>"
    # unix standards process exit codes
    return 2 if err else 0
def main(*args):
    """Entry point: re-time the file named by args[0] in place.

    Processes the source into a temporary file and, on success, renames
    the temp file over the source. Returns a process exit code.
    """
    try:
        sourcename = args[0]
    except IndexError as e:
        return usage("missing <filename> argument")
    # `delete=False` prevents the tmp file from being deleted on closing,
    # so it can be os.rename()d over the source afterwards.
    # NOTE(review): NamedTemporaryFile defaults to binary mode -- fine on
    # Python 2 (this file's dialect); Python 3 would need mode='w'.
    dest = tempfile.NamedTemporaryFile(delete=False)
    with open(sourcename) as source:
        try:
            process(source, dest)
        except Exception as e:
            dest.close()
            # BUG FIX: os.remove() wants the file's *path*; the original
            # passed the file object itself (`os.remove(dest)`), which
            # raises TypeError and masks the real error being re-raised.
            os.remove(dest.name)
            raise
    # ok done
    dest.close()
    os.rename(dest.name, sourcename)
    return 0

if __name__ == "__main__":
    # only execute main() if we are called as a script
    # (so we can also import this file as a module)
    sys.exit(main(*sys.argv[1:]))
It gives the expected results on your sample data (running on linux - but it should be ok on any other supported OS afaict).
Note that I wrote it to work like your original code (replace the source file with the processed one), but if it were my code I would instead either explicitely provide a destination filename or as a default write to sys.stdout instead (and redirect stdout to another file). The process function can deal with any of those solution FWIW - it's only a matter of a couple edits in main().

Python script for parsing ldap logs for getting Searcches/Binds [duplicate]

I am writing a script a in python to parse ldap logs and then get the number of searches/binds by each user. I was testing my code on sample files and for smaller files till size of 5-10MB it runs quick and completes within a 1 minute on my local PC. However when i ran the script on a file worth 18M having around 150000 lines in it, it takes around 5 minutes, i want to run this script on file sizes of 100M and maybe be 5-6 files in each run so that means script has to parse almost of 600-700M of data in each run. But i suppose it would take long time to run, so i would need some advise from you guys if my below code can be fine tuned for better performance in terms of execution time.
import os,re,datetime
from collections import defaultdict

# d: conn-ids grouped per bind DN; k: (conn, search-count) pairs per DN
d=defaultdict(list)
k=defaultdict(list)
start_time=datetime.datetime.now()
# NOTE(review): slurps the whole log into memory at once, and the file
# object is never explicitly closed
fh = open("C:\\Rohit\\ECD Utilization Script - Copy\\logdir\\access","r").read()
pat=re.compile(' BIND REQ .*conn=([\d]*).*dn=(.*")')
# all SEARCH REQ lines collapsed into one big string, rescanned below
srchStr='\n'.join(re.findall(r' SEARCH REQ .*',fh))
bindlist=re.findall(pat,fh)
for entry in bindlist:
    # entry is (conn, dn); key on the first RDN of the dn
    d[entry[-1].split(",")[0]].append(entry[0])
for key in d:
    for con in d[key]:
        # PERFORMANCE: rescans srchStr once per connection id, making the
        # script quadratic -- this is the hotspot the answers below discuss
        count = re.findall(con,srchStr)
        k[key].append((con,len(count)))
#
for key in k:
    print("Number of searches by ",key, " : ",sum([i[1] for i in k[key]]))
for key in d:
    print("No of bind ",key," = ",len(d[key]))
end_time=datetime.datetime.now()
print("Total time taken - {}".format(end_time-start_time))
You are doing several scans on entire file on the line
count = re.findall('SEARCH REQ.*'+conid,fh1)
Avoid this. This is your major problem. Get all conids in a list and iterate on file again and list while your inner loop should consist of conids. Bring it out of outer loop. You will be doing two scans of file.
Also since it is plain Python run with PyPy for faster runs.
You can do this better with a FSM and by spending a bit more RAM. This is a hint and you will have to do your FSM yourself.
Edit 1: This is the version of script I wrote after seeing the log file. Please correct if there is any mistake:
#!/usr/bin/env python
import sys
import re
def parse(filepath):
    """Tally BIND and SEARCH requests per uid in the LDAP access log at
    filepath, then print one summary line per uid."""
    bind_re = re.compile(r'(.*)?BIND\sREQ(.*)uid=(\w+)')
    search_re = re.compile(r'(.*)?SEARCH\sREQ(.*)uid=(\w+)')
    counts = {}
    with open(filepath, 'r') as f:
        for line in f:
            bind_hit = re.search(bind_re, line)
            if bind_hit:
                uid = bind_hit.group(3)
                # first sighting initialises both counters to zero
                entry = counts.setdefault(uid, {'bind_count': 0, 'search_count': 0})
                entry['bind_count'] += 1
            search_hit = re.search(search_re, line)
            if search_hit:
                uid = search_hit.group(3)
                entry = counts.setdefault(uid, {'bind_count': 0, 'search_count': 0})
                entry['search_count'] += 1
    for uid in counts:
        print('user id = ' + uid, 'Bind count = ' + str(counts[uid]['bind_count']), 'Search count = ' + str(counts[uid]['search_count']))
def process_args():
    """Validate CLI arguments; print usage and exit when the log path is missing."""
    # BUG FIX: the original compared the argv *list* to an int
    # (`if sys.argv < 2:`) -- a TypeError on Python 3 and never true on
    # Python 2 -- so the usage check never worked. Compare its length.
    if len(sys.argv) < 2:
        print('Usage: parse_ldap_log.py log_filepath')
        exit(1)

if __name__ == '__main__':
    process_args()
    parse(sys.argv[1])
Thank the Gods that it was not complicated enough to warrant an FSM.
Use itertools library instead of so many loops.
Your script has a quadratic complexity: for each line in the file you are making a read again to match the log entry.
My suggestion is to read the file only one time and counting the occurrences of the needed entry (the one matching (" BIND REQ ")).
I was able to solve my problem with below code.
import os,re,datetime
from collections import defaultdict

start_time=datetime.datetime.now()
# per-uid bind totals, per-conn search totals, and conn -> uid mapping
bind_count=defaultdict(int)
search_conn=defaultdict(int)
bind_conn=defaultdict(str)
# j: per-uid search totals, resolved through the conn -> uid mapping
j=defaultdict(int)
fh = open("C:\\access","r")
total_searches=0
total_binds=0
# Single streaming pass over the log: tally binds per uid and searches per
# connection id (this replaces the quadratic rescan of the first version).
for line in fh:
    reg1=re.search(r' BIND REQ .*conn=(\d+).*dn=(.*")', line)
    reg2=re.search(r' SEARCH REQ .*conn=(\d+).*', line)
    if reg1:
        total_binds+=1
        # group 2 is the dn (used as uid key), group 1 the conn id
        uid,con=reg1.group(2,1)
        bind_count[uid]=bind_count[uid]+1
        bind_conn[con]=uid
    if reg2:
        total_searches+=1
        skey=reg2.group(1)
        search_conn[skey] = search_conn[skey]+1
# Attribute each connection's searches to the uid that bound on it.
for conid in search_conn:
    if conid in bind_conn:
        new_key=bind_conn[conid]
        j[new_key]=j[new_key]+search_conn[conid]
for k,v in bind_count.items():
    print(k," = ",v)
print("*"*80)
for k,v in j.items():
    print(k,"-->",v)
fh.close()
del search_conn
del bind_conn
end_time=datetime.datetime.now()
print("Total time taken - {}".format(end_time-start_time))

python script taking long time to run

I am writing a script a in python to parse ldap logs and then get the number of searches/binds by each user. I was testing my code on sample files and for smaller files till size of 5-10MB it runs quick and completes within a 1 minute on my local PC. However when i ran the script on a file worth 18M having around 150000 lines in it, it takes around 5 minutes, i want to run this script on file sizes of 100M and maybe be 5-6 files in each run so that means script has to parse almost of 600-700M of data in each run. But i suppose it would take long time to run, so i would need some advise from you guys if my below code can be fine tuned for better performance in terms of execution time.
import os,re,datetime
from collections import defaultdict

# d: conn-ids grouped per bind DN; k: (conn, search-count) pairs per DN
d=defaultdict(list)
k=defaultdict(list)
start_time=datetime.datetime.now()
# NOTE(review): slurps the whole log into memory at once, and the file
# object is never explicitly closed
fh = open("C:\\Rohit\\ECD Utilization Script - Copy\\logdir\\access","r").read()
pat=re.compile(' BIND REQ .*conn=([\d]*).*dn=(.*")')
# all SEARCH REQ lines collapsed into one big string, rescanned below
srchStr='\n'.join(re.findall(r' SEARCH REQ .*',fh))
bindlist=re.findall(pat,fh)
for entry in bindlist:
    # entry is (conn, dn); key on the first RDN of the dn
    d[entry[-1].split(",")[0]].append(entry[0])
for key in d:
    for con in d[key]:
        # PERFORMANCE: rescans srchStr once per connection id, making the
        # script quadratic -- this is the hotspot the answers below discuss
        count = re.findall(con,srchStr)
        k[key].append((con,len(count)))
#
for key in k:
    print("Number of searches by ",key, " : ",sum([i[1] for i in k[key]]))
for key in d:
    print("No of bind ",key," = ",len(d[key]))
end_time=datetime.datetime.now()
print("Total time taken - {}".format(end_time-start_time))
You are doing several scans on entire file on the line
count = re.findall('SEARCH REQ.*'+conid,fh1)
Avoid this. This is your major problem. Get all conids in a list and iterate on file again and list while your inner loop should consist of conids. Bring it out of outer loop. You will be doing two scans of file.
Also since it is plain Python run with PyPy for faster runs.
You can do this better with a FSM and by spending a bit more RAM. This is a hint and you will have to do your FSM yourself.
Edit 1: This is the version of script I wrote after seeing the log file. Please correct if there is any mistake:
#!/usr/bin/env python
import sys
import re
def parse(filepath):
    """Count BIND and SEARCH requests per uid in the LDAP access log at
    filepath and print one summary line per uid."""
    d = {}
    # NOTE(review): the leading '(.*)?' groups are redundant; only
    # group(3) -- the uid -- is ever used
    regex1 = re.compile(r'(.*)?BIND\sREQ(.*)uid=(\w+)')
    regex2 = re.compile(r'(.*)?SEARCH\sREQ(.*)uid=(\w+)')
    with open(filepath, 'r') as f:
        for l in f:
            m = re.search(regex1, l)
            if m:
                # print (m.group(3))
                uid = m.group(3)
                if uid in d:
                    d[uid]['bind_count'] += 1
                else:
                    # first sighting via a bind: initialise both counters
                    d[uid] = {}
                    d[uid]['bind_count'] = 1
                    d[uid]['search_count'] = 0
            m = re.search(regex2, l)
            if m:
                # print (m.group(3))
                uid = m.group(3)
                if uid in d:
                    d[uid]['search_count'] += 1
                else:
                    # first sighting via a search: initialise both counters
                    d[uid] = {}
                    d[uid]['search_count'] = 1
                    d[uid]['bind_count'] = 0
    for k in d:
        print('user id = ' + k, 'Bind count = ' + str(d[k]['bind_count']), 'Search count = ' + str(d[k]['search_count']))
def process_args():
    """Validate CLI arguments; print usage and exit when the log path is missing."""
    # BUG FIX: the original compared the argv *list* to an int
    # (`if sys.argv < 2:`) -- a TypeError on Python 3 and never true on
    # Python 2 -- so the usage check never worked. Compare its length.
    if len(sys.argv) < 2:
        print('Usage: parse_ldap_log.py log_filepath')
        exit(1)

if __name__ == '__main__':
    process_args()
    parse(sys.argv[1])
Thank the Gods that it was not complicated enough to warrant an FSM.
Use itertools library instead of so many loops.
Your script has a quadratic complexity: for each line in the file you are making a read again to match the log entry.
My suggestion is to read the file only one time and counting the occurrences of the needed entry (the one matching (" BIND REQ ")).
I was able to solve my problem with below code.
import os,re,datetime
from collections import defaultdict

start_time=datetime.datetime.now()
# per-uid bind totals, per-conn search totals, and conn -> uid mapping
bind_count=defaultdict(int)
search_conn=defaultdict(int)
bind_conn=defaultdict(str)
# j: per-uid search totals, resolved through the conn -> uid mapping
j=defaultdict(int)
fh = open("C:\\access","r")
total_searches=0
total_binds=0
# Single streaming pass over the log: tally binds per uid and searches per
# connection id (this replaces the quadratic rescan of the first version).
for line in fh:
    reg1=re.search(r' BIND REQ .*conn=(\d+).*dn=(.*")', line)
    reg2=re.search(r' SEARCH REQ .*conn=(\d+).*', line)
    if reg1:
        total_binds+=1
        # group 2 is the dn (used as uid key), group 1 the conn id
        uid,con=reg1.group(2,1)
        bind_count[uid]=bind_count[uid]+1
        bind_conn[con]=uid
    if reg2:
        total_searches+=1
        skey=reg2.group(1)
        search_conn[skey] = search_conn[skey]+1
# Attribute each connection's searches to the uid that bound on it.
for conid in search_conn:
    if conid in bind_conn:
        new_key=bind_conn[conid]
        j[new_key]=j[new_key]+search_conn[conid]
for k,v in bind_count.items():
    print(k," = ",v)
print("*"*80)
for k,v in j.items():
    print(k,"-->",v)
fh.close()
del search_conn
del bind_conn
end_time=datetime.datetime.now()
print("Total time taken - {}".format(end_time-start_time))

Piping my Python Program through another program

I'm trying to make program using Python.
I want to be able to pipe my program's output through another program:
#EXAMPLE: ./my_python | another_program
Here is the code I have so far.
This code saves output to file:
#!/usr/bin/env python
# Generate 100000 random 8-character codes; write them to newRa.txt and
# their reversals to test.txt.
import os, random, string
# This is not my own code
''' As far asi know, It belongs to NullUserException. Was found on stackoverflow.com'''
length = 8
chars = string.ascii_letters.upper()+string.digits
# NOTE(review): this does NOT seed the RNG -- it rebinds the name
# `random.seed` to a bytes object instead of calling random.seed(...)
random.seed = (os.urandom(1024))
# my code
file_out = open('newRa.txt','w') # Create a 'FILE' to save Generated Passwords
list1=[]
while len(list1) < 100000:
    list1.append(''.join(random.choice(chars) for i in range(length)))
for item in list1:
    file_out.write('%s\n' % item)
file_out.close()
# second file: each code reversed
# NOTE(review): file_out1 is never explicitly closed or flushed
file_out1=open('test.txt','w')
for x in list1:
    file_out1.write('%s\n' %x[::-1])
This is the code I have trying to pipe it through another program:
#!/usr/bin/env python
# Python 2 script: generate 1000 random codes and print them (and their
# reversals) to stdout so the output can be piped to another program.
import os,string,random,sys
length = 8
chars = string.ascii_letters.upper()+string.digits
# NOTE(review): rebinds random.seed instead of calling random.seed(...)
random.seed = (os.urandom(1024))
keep=[]
keep1=[]
while len(keep)<1000:
    keep.append(''.join(random.choice(chars) for i in range(length)))
    # prints the whole list-so-far reversed, not a single code
    # (indentation reconstructed -- TODO confirm against the original post)
    print '\n',keep[::-1]
for x in keep:
    keep1.append(x[::-1])
# NOTE(review): len(keep1) is already 1000 here, so this loop never runs
while len(keep1) < 1000:
    print keep1
I have tried chmod and using the script as a executable.
Ok sorry for my lack of google search.
sys.stdout is the answer
#!/usr/bin/env python
# Emit 1000 random 8-character codes on stdout, one per line, flushing
# after each so the stream can be piped into another program.
import os,string,random,sys

length = 8
chars = string.ascii_letters.upper()+string.digits
# BUG FIX: call random.seed(...); the original assigned
# `random.seed = (...)`, which only rebinds the name and never seeds.
random.seed(os.urandom(1024))
keep=[]
while len(keep)<1000:
    # BUG FIX: append to the list; the original rebound `keep` to an
    # 8-character string, so len(keep) < 1000 was always true (infinite loop).
    keep.append(''.join(random.choice(chars) for i in range(length)))
    # BUG FIX (the asked question): file.write() returns None, so
    # `print sys.stdout.write(keep)` appended "None" to every code.
    # Write the code plus a newline directly instead.
    sys.stdout.write(keep[-1] + '\n')
    sys.stdout.flush()
I stripped my code down (as it makes it a lot faster, But I'm getting this when execute
my code........
P5DBLF4KNone
DVFV3JQVNone
CIMKZFP0None
UZ1QA3HTNone
How do I get rid of the 'None' on the end?
What I have done to cause this ?
Should This Be A Seperate Question??

Categories