I am a Python newbie trying to run the simple example below. I wish to pass files and certain values as parameters to my function latcalc(). Could anyone suggest how I can pass my files and values as parameters, or whether there is a better way/approach to do this?
#!/usr/bin/python
# include the constants
min_length = 1
max_length = 30
# delays
delay = 100
# Speed of light
c_vaccum = 3e8
global filename1
global filename2
global filename3
def openfiles():
    filename1 = open("file1.txt", "w")
    filename2 = open("file2.txt", "w")
    filename3 = open("file3.txt", "w")

def latcalc(filename, target_name, vf):
    target_name = 0
    for length in range(min_length, max_length):
        if length < 2:
            target_name += (length/(vf * c_vaccum))
        elif length == 2:
            target_name += delay
        else:
            target_name = target_name
        myline = "%s\t%s\n" % (length, target_name)
        filename.write(myline)

openfiles()
latcalc(filename1, lat40, 0.4)
latcalc(filename2, lat80, 0.8)
latcalc(filename3, lat100, 1)
I would create a little class (give it a useful name) to encapsulate your data.
If your files grow, you only have to change your create_lats function.
min_length = 1
max_length = 30
# delays
delay = 100
# Speed of light
c_vaccum = 3e8
#Little class to keep our data in one place
class Lat:
    def __init__(self, filename, factor):
        self.filename = filename
        self.factor = factor
        self.file = open(filename, "w")  # let the class open the file

# now our function needs only one parameter, neat!
def latcalc(lat):
    target_name = 0
    for length in range(min_length, max_length):
        if length < 2:
            target_name += (length / (lat.factor * c_vaccum))  # access the instance attribute
        elif length == 2:
            target_name += delay
        else:
            target_name = target_name
        myline = "%s\t%s\n" % (length, target_name)
        lat.file.write(myline)

def create_lats():
    lats = []
    lats.append(Lat("file1.txt", 0.4))
    lats.append(Lat("file2.txt", 0.8))
    lats.append(Lat("file3.txt", 1))
    return lats

# loop over your lats created in create_lats
for lat in create_lats():
    latcalc(lat)
    lat.file.close()  # close the file
try something like this (notice the globals are gone):
def openfiles(namelist):
    ret = []
    for name in namelist:
        fi = open(name, 'w')
        ret.append(fi)
    return ret

filelist = ['file1.txt', 'file2.txt', 'file3.txt']
handles = openfiles(filelist)
for handle in handles:
    <do whatever you want>
handles will be a list of file handles corresponding to the filelist of names.
Note the file handle is what you pass around to do reads & writes with.
Also, the opens could be done inside latcalc itself, since you would apparently be doing one file per call.
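For example, a minimal sketch of that variation (reusing the constants and the latcalc logic from the question; the filenames are assumptions) where each call opens and closes its own file:
def latcalc(name, vf):
    # open the file inside the function: one file per call, closed automatically
    with open(name, "w") as f:
        target_name = 0
        for length in range(min_length, max_length):
            if length < 2:
                target_name += length / (vf * c_vaccum)
            elif length == 2:
                target_name += delay
            f.write("%s\t%s\n" % (length, target_name))

for name, vf in [("file1.txt", 0.4), ("file2.txt", 0.8), ("file3.txt", 1)]:
    latcalc(name, vf)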
As some comments point out, you don't need global variables, and you should close your file handles after you have finished writing to them. That is most conveniently done with a 'with' statement (closing is done for you, even in case of an unexpected exception):
#!/usr/bin/python
min_length = 1
max_length = 3
delay = 100
c_vaccum = 3e8
def latcalc(filename, vf):
    target_name = 0
    for length in range(min_length, max_length):
        if length < 2:
            target_name += (length/(vf * c_vaccum))
        elif length == 2:
            target_name += delay
    myline = "%s\t%d\n" % (length, target_name)
    with open(filename, "w") as f:
        f.write(myline)
    return target_name
lat40 = latcalc("file1.txt", 0.4)
lat80 = latcalc("file2.txt", 0.8)
lat100 = latcalc("file3.txt", 1)
The way you treat the parameter target_name, I assume you are used to C-style pointers, which do not exist in that form in Python. The parameter is pointless here if you set it to a new value in the first line of latcalc(). Also, you seem to treat target_name as a string when it is an int:
myline="%s\t%s\n" % (length, target_name)
If you need target_name after the method has finished, you would have to return it.
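A minimal sketch of that, assuming you want the accumulated value back under a name of your choosing:
def latcalc(filename, vf):
    target_name = 0
    # ... accumulate target_name and write the lines as above ...
    return target_name

lat40 = latcalc("file1.txt", 0.4)   # the returned value replaces the "output parameter"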
1) open() gives you a file handle (a file object), not a filename.
2) Use a "with" statement when opening a file, to avoid "forgetting" to close the file when finished.
#!/usr/bin/python
# include the constants
min_length = 1
max_length = 30
# delays
delay = 100
# Speed of light
c_vaccum = 3e8
def latcalc(filename, target_name, vf):
    with open(filename, "w") as openedFile:
        target_name = 0
        for length in range(min_length, max_length):
            if length < 2:
                target_name += (length/(vf * c_vaccum))
            elif length == 2:
                target_name += delay
            else:
                target_name = target_name
            myline = "%s\t%s\n" % (length, target_name)
            openedFile.write(myline)
latcalc("file1.txt", "lat40", 0.4)
latcalc("file2.txt", "lat80", 0.8)
latcalc("file3.txt", "lat100", 1)
I hope I am not downvoted this time. I have been struggling with parallel processing in Python for a while (2 days, exactly). I have been checking these resources (a partial list is shown here):
(a) http://eli.thegreenplace.net/2013/01/16/python-paralellizing-cpu-bound-tasks-with-concurrent-futures
(b) https://pythonadventures.wordpress.com/tag/processpoolexecutor/
I came unstuck. What I want to do is this:
Master:
Break up the file into chunks(strings or numbers)
Broadcast a pattern to be searched to all the workers
Receive the offsets in the file where the pattern was found
Workers:
Receive pattern and chunk of text from the master
Compute()
Send back the offsets to the master.
I tried to implement this using MPI/concurrent.futures/multiprocessing and came unstuck.
My naive implementation using the multiprocessing module:
import multiprocessing
filename = "file1.txt"
pat = "afow"
N = 1000
""" This is the naive string search algorithm"""
def search(pat, txt):
    patLen = len(pat)
    txtLen = len(txt)
    offsets = []
    # A loop to slide pattern[] one by one
    # Range generates numbers up to but not including that number
    for i in range((txtLen - patLen) + 1):
        # Can not use a for loop here
        # For loops in C with && statements must be
        # converted to while statements in python
        counter = 0
        while (counter < patLen) and pat[counter] == txt[counter + i]:
            counter += 1
        if counter >= patLen:
            offsets.append(i)
    return str(offsets).strip('[]')
""""
This is what I want
if __name__ == "__main__":
tasks = []
pool_outputs = []
pool = multiprocessing.Pool(processes=5)
with open(filename, 'r') as infile:
lines = []
for line in infile:
lines.append(line.rstrip())
if len(lines) > N:
pool_output = pool.map(search, tasks)
pool_outputs.append(pool_output)
lines = []
if len(lines) > 0:
pool_output = pool.map(search, tasks)
pool_outputs.append(pool_output)
pool.close()
pool.join()
print('Pool:', pool_outputs)
"""""
with open(filename, 'r') as infile:
    for line in infile:
        print(search(pat, line))
I would be grateful for any guidance, especially with concurrent.futures. Thanks for your time. Valeriy helped me with his addition and I thank him for that.
But if anyone could just indulge me for a moment, this is the code I was working on for concurrent.futures (working off an example I saw somewhere):
from concurrent.futures import ProcessPoolExecutor, as_completed
import math
def search(pat, txt):
    patLen = len(pat)
    txtLen = len(txt)
    offsets = []
    # A loop to slide pattern[] one by one
    # Range generates numbers up to but not including that number
    for i in range((txtLen - patLen) + 1):
        # Can not use a for loop here
        # For loops in C with && statements must be
        # converted to while statements in python
        counter = 0
        while (counter < patLen) and pat[counter] == txt[counter + i]:
            counter += 1
        if counter >= patLen:
            offsets.append(i)
    return str(offsets).strip('[]')

# Check a list of strings
def chunked_worker(lines):
    return {0: search("fmo", line) for line in lines}

def pool_bruteforce(filename, nprocs):
    lines = []
    with open(filename) as f:
        lines = [line.rstrip('\n') for line in f]
    chunksize = int(math.ceil(len(lines) / float(nprocs)))
    futures = []
    with ProcessPoolExecutor() as executor:
        for i in range(nprocs):
            chunk = lines[(chunksize * i): (chunksize * (i + 1))]
            futures.append(executor.submit(chunked_worker, chunk))
        resultdict = {}
        for f in as_completed(futures):
            resultdict.update(f.result())
    return resultdict

filename = "file1.txt"
pool_bruteforce(filename, 5)
Thanks again, Valeriy, and anyone who attempts to help me solve my riddle.
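For the concurrent.futures side specifically, here is a minimal self-contained sketch of the master/worker split, not a drop-in fix for the code above: the chunk size, pattern and filename are placeholders, and results are keyed by each chunk's starting line so offsets from different chunks cannot collide:
from concurrent.futures import ProcessPoolExecutor

def find_offsets(args):
    start, chunk_lines, pat = args
    hits = []
    for i, line in enumerate(chunk_lines):
        col = line.find(pat)
        if col != -1:
            hits.append((start + i, col))   # (line number, column) of the first match per line
    return hits

if __name__ == "__main__":
    pat = "afow"                              # assumed pattern
    with open("file1.txt") as f:              # assumed input file
        lines = [line.rstrip("\n") for line in f]
    chunksize = 1000
    chunks = [(i, lines[i:i + chunksize], pat) for i in range(0, len(lines), chunksize)]
    with ProcessPoolExecutor(max_workers=5) as executor:
        for hits in executor.map(find_offsets, chunks):   # master gathers offsets from workers
            print(hits)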
You are using several arguments, so:
import multiprocessing
from functools import partial
filename = "file1.txt"
pat = "afow"
N = 1000
""" This is the naive string search algorithm"""
def search(pat, txt):
    patLen = len(pat)
    txtLen = len(txt)
    offsets = []
    # A loop to slide pattern[] one by one
    # Range generates numbers up to but not including that number
    for i in range((txtLen - patLen) + 1):
        # Can not use a for loop here
        # For loops in C with && statements must be
        # converted to while statements in python
        counter = 0
        while (counter < patLen) and pat[counter] == txt[counter + i]:
            counter += 1
        if counter >= patLen:
            offsets.append(i)
    return str(offsets).strip('[]')

if __name__ == "__main__":
    tasks = []
    pool_outputs = []
    pool = multiprocessing.Pool(processes=5)
    lines = []
    with open(filename, 'r') as infile:
        for line in infile:
            lines.append(line.rstrip())
    tasks = lines
    func = partial(search, pat)
    if len(lines) > N:
        pool_output = pool.map(func, lines)
        pool_outputs.append(pool_output)
    elif len(lines) > 0:
        pool_output = pool.map(func, lines)
        pool_outputs.append(pool_output)
    pool.close()
    pool.join()
    print('Pool:', pool_outputs)
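For reference, functools.partial is what turns the two-argument search into the one-argument callable that pool.map needs; a tiny stand-in illustration:
from functools import partial

def search(pat, txt):           # stand-in with the same signature as above
    return txt.find(pat)

func = partial(search, "afow")  # pat is now fixed
print(func("xxafowyy"))         # 2 -- same as search("afow", "xxafowyy")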
This is what I am doing:
import csv
output = open('output.txt' , 'wb')
# this function returns the min for num.txt
def get_min(num):
    return int(open('%s.txt' % num, 'r+').readlines()[0])

# temporary variables
last_line = ''
input_list = []

# iterate over input.txt and sort the input into a list of tuples
for i, line in enumerate(open('input.txt', 'r+').readlines()):
    if i % 2 == 0:
        last_line = line
    else:
        input_list.append((last_line, line))
filtered = [(header, data[:get_min(header[-2])] + '\n' ) for (header, data) in input_list]
[output.write(''.join(data)) for data in filtered]
output.close()
In this code, input.txt is something like this:
>012|013|0|3|M
AFDSFASDFASDFA
>005|5|67|0|6
ACCTCTGACC
>029|032|4|5|S
GGCAGGGAGCAGGCCTGTA
and num.txt is something like this
M 4
P 10
For each record in the above input.txt, I want to look up the value in num.txt whose key matches the record's last column, and cut the record's data line to that many characters.
I think the error in my code is that it only accepts files containing integers, where it should also accept files which contain alphabetic characters.
The totally revised version, after a long chat with the OP;
import os
import re
# Fetch all hashes and counts
file_c = open('num.txt')
file_c = file_c.read()
lines = re.findall(r'\w+\.txt \d+', file_c)
numbers = {}
for line in lines:
    line_split = line.split('.txt ')
    hash_name = line_split[0]
    count = line_split[1]
    numbers[hash_name] = count
#print(numbers)
# The input file
file_i = open('input.txt')
file_i = file_i.read()
for hash_name, count in numbers.iteritems():
    regex = '(' + hash_name.strip() + ')'
    result = re.findall(r'>.*\|(' + regex + ')(.*?)>', file_i, re.S)
    if len(result) > 0:
        data_original = result[0][2]
        stripped_data = result[0][2][int(count):]
        file_i = file_i.replace(data_original, '\n' + stripped_data)
        #print(data_original)
        #print(stripped_data)
#print(file_i)
# Write the input file to new input_new.txt
f = open('input_new.txt', 'wt')
f.write(file_i)
You can do it like so;
import re
min_count = 4 # this variable will contain that count integer from where to start removing
str_to_match = 'EOG6CC67M' # this variable will contain the filename you read
input = '' # The file input (input.txt) will go in here
counter = 0
def callback_f(e):
    global min_count
    global counter
    counter += 1
    # Check your input
    print(str(counter) + ' >>> ' + e.group())
    # Only replace the value with nothing (remove it) after a certain count
    if counter > min_count:
        return ''  # replace with nothing
    return e.group()  # otherwise keep the match (re.sub expects a string back)
result = re.sub(r''+str_to_match, callback_f, input)
With this tactic you can keep count with a global counter and there's no need to do hard line-loops with complex structures.
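For example, with a made-up input containing six occurrences of the hash, the callback keeps the first four and removes the rest:
counter = 0
input = ("EOG6CC67M aaa EOG6CC67M bbb EOG6CC67M ccc "
         "EOG6CC67M ddd EOG6CC67M eee EOG6CC67M fff")
result = re.sub(r'' + str_to_match, callback_f, input)
print(result)   # occurrences 5 and 6 are gone, the first four remain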
Update
More detailed version with file access;
import os
import re
def callback_f(e):
    global counter
    counter += 1
    # Check your input
    print(str(counter) + ' >>> ' + e.group())

# Fetch all hash-file names and their content (count)
num_files = os.listdir('./num_files')
numbers = {}
for file in num_files:
    if file[0] != '.':
        file_c = open('./num_files/' + file)
        file_c = file_c.read()
        numbers[file.split('.')[0]] = file_c

# Now the CSV files
csv_files = os.listdir('./csv_files')
for file in csv_files:
    if file[0] != '.':
        for hash_name, min_count in numbers.iteritems():
            file_c = open('./csv_files/' + file)
            file_c = file_c.read()
            counter = 0
            result = re.sub(r''+hash_name, callback_f, file_c)
            # Write the replaced content back to the file here
Considered directory/file structure;
+ Projects
+ Project_folder
+ csv_files
- input1.csv
- input2.csv
~ etc.
+ num_files
- EOG6CC67M.txt
- EOG62JQZP.txt
~ etc.
- python_file.py
The CSV files contain the big chunks of text you state in your original question.
The num files contain the hash files, each with an integer in them.
What happens in this script;
Collect all hash files (in a dictionary) and their inner count numbers
Loop through all CSV files
Subloop through the collected numbers for each CSV file
Replace/remove (based on what you do in callback_f()) hashes after a certain count
Write the output back (it's the last comment in the script; a minimal sketch of that step follows)
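A minimal sketch of that write-back step (assuming you simply want to overwrite each CSV file in place with the replaced content) would go where the final comment sits in the loop above:
with open('./csv_files/' + file, 'w') as out:
    out.write(result)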
I am (attempting) to write a program that searches through a hex file for instances of a hex string between two values, e.g. between D4135B and D414AC, incrementing from the first value until the second is reached: D4135B, D4135C, D4135D, etc.
I have managed to get it to increment etc., but it's the search part I am having trouble with.
This is the code I have so far; it's been cobbled together from other places and I need to make it somehow output all search hits into the output file (file_out).
I have exceeded the limit of my Python understanding and I'm sure there's probably a much easier way of doing this. I would be very grateful for any help.
def search_process(hx):  # searching for two binary strings
    global FLAG
    while threeByteHexPlusOne != threeByteHex2:  # keep incrementing until second value reached
        if FLAG:
            if hx.find(threeByteHex2) != -1:
                FLAG = False  # if threeByteHex = threeByteHexPlusOne, end search
                print("Reached the end of the search", hx.find(threeByteHexPlusOne))
        else:
            if hx.find(threeByteHexPlusOne) != -1:
                FLAG = True
    return -1  # if no results found

if __name__ == '__main__':
    try:
        file_in = open(FILE_IN, "r")    # opening input file
        file_out = open(FILE_OUT, 'w')  # opening output file
        hx_read = file_in.read()        # read from input file
        tmp = ''
        found = ''
        while hx_read:  # reading from file till file is empty
            hx_read = tmp + hx_read
            pos = search_process(hx_read)
            while pos != -1:
                hex_read = hx_read[pos:]
                if FLAG:
                    found = found + hx_read
                pos = search_process(hx_read)
            tmp = bytes_read[]
            hx_read = file_in.read()
        file_out.write(found)  # writing to output file
    except IOError:
        print('FILE NOT FOUND!!! Check your filename or directory/PATH')
Here's a program that looks through a hex string from a file 3 bytes at a time and if the 3-byte hex string is between the given hex bounds, it writes it to another file. It makes use of generators to make getting the bytes from the hex string a little cleaner.
import base64
import sys
_usage_string = 'Usage: python {} <input_file> <output_file>'.format(sys.argv[0])
def _to_base_10_int(value):
    return int(value, 16)

def get_bytes(hex_str):
    # Two characters equals one byte
    for i in range(0, len(hex_str), 2):
        yield hex_str[i:i+2]

def get_three_byte_hexes(hex_str):
    bytes = get_bytes(hex_str)
    while True:
        try:
            three_byte_hex = next(bytes) + next(bytes) + next(bytes)
        except StopIteration:
            break
        yield three_byte_hex

def find_hexes_in_range(hex_str, lower_bound_hex, upper_bound_hex):
    lower_bound = _to_base_10_int(lower_bound_hex)
    upper_bound = _to_base_10_int(upper_bound_hex)
    found = []
    for three_byte_hex in get_three_byte_hexes(hex_str):
        hex_value = _to_base_10_int(three_byte_hex)
        if lower_bound <= hex_value < upper_bound:
            found.append(three_byte_hex)
    return found

if __name__ == "__main__":
    try:
        assert(len(sys.argv) == 3)
    except AssertionError:
        print _usage_string
        sys.exit(2)
    file_contents = open(sys.argv[1], 'rb').read()
    hex_str = base64.decodestring(file_contents).encode('hex')
    found = find_hexes_in_range(hex_str, 'D4135B', 'D414AC')
    print('Found:')
    print(found)
    if found:
        with open(sys.argv[2], 'wb') as fout:
            for _hex in found:
                fout.write(_hex)
Check out some more info on generators here
I want to create a Python program which splits up a file into segments of a specified width, and then a consumer program takes the segments and creates a duplicate of the original file. The segments might be out of order, so I intend to use the offset value to write to the file.
Is there a way I can achieve this without creating a local array to hold all the data on the receiving end?
For example:
f = open(file, "wb")
f.seek(offset)
f.write(data)
The idea behind this is that the program that sends the file might not be able to finish sending the file, and will resume again once it has started.
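A minimal sketch of that consumer side, under the assumption that segments arrive as (offset, data) pairs and the target file is created up front, would be:
def write_segment(path, offset, data):
    # r+b keeps existing contents, so earlier segments survive between calls
    with open(path, "r+b") as f:
        f.seek(offset)
        f.write(data)

open("copy.bin", "wb").close()   # make sure the target exists before the first segment
for offset, data in [(4, b"5678"), (0, b"1234")]:   # hypothetical out-of-order segments
    write_segment("copy.bin", offset, data)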
I have sample code below where the "combine_bytes" function throws an exception when I try placing data in the buffer location.
import sys
import os
def SplitFile(fname, start, end, width):
    t_fileSize = os.path.getsize(fname)
    buffData = bytearray(t_fileSize)
    for line, offset in get_bytes(fname, int(start), int(end), int(width)):
        combine_bytes(buffData, offset, line, width)
        nums = ["%02x" % ord(c) for c in line]
        print " ".join(nums)
    f = open("Green_copy.jpg", "wb")
    f.write(buffData)
    f.close()

def combine_bytes(in_buff, in_offset, in_data, in_width):
    #something like memcpy would be nice
    #in_buff[in_offset:in_offset + in_width] = in_data
    #this works but it's the mother of inefficiency
    i = in_offset
    for c in in_data:
        in_buff.insert(i, c)
        i = i + 1

def get_bytes(fname, start, end, width):
    t_currOffset = start
    t_width = width
    f = open(fname, "r+b")
    if end != 0:
        while t_currOffset < end:
            f.seek(t_currOffset)
            if (t_currOffset + t_width) > end:
                t_width = end - t_currOffset
            t_data = f.read(t_width)
            yield t_data, t_currOffset
            t_currOffset += t_width
    else:
        f.seek(t_currOffset)
        t_data = f.read(t_width)
        while t_data:
            yield t_data, t_currOffset
            t_currOffset += t_width
            f.seek(t_currOffset)
            t_data = f.read(t_width)
    f.close()

if __name__ == '__main__':
    try:
        SplitFile(*sys.argv[1:5])
    except:
        print "Unexpected error:", sys.exc_info()[0]
I still couldn't figure out what your intent is, but this version of combine_bytes will get rid of your "mother of inefficiency" part (which actually is exactly that):
def combine_bytes(in_buff, in_offset, in_data, in_width):
    #something like memcpy would be nice
    #in_buff[in_offset:in_offset + in_width] = in_data
    in_buff = in_buff[:in_offset] + in_data + in_buff[in_offset:]
    return in_buff
Of course this creates a new (larger) buffer for each call, and you have to replace your buffer on the caller scope with the one returned:
buffData = combine_bytes(buffData, offset, line, width)
Found it. Here is a better way which produces what I wanted and is faster: _buffData[t_offset:t_offset + len(t_data)] = bytearray(t_data)
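A tiny standalone illustration of that slice assignment (with made-up data), which overwrites in place instead of inserting and shifting:
buff = bytearray(10)                        # pre-sized buffer, like buffData in SplitFile
chunk = b"ABC"
offset = 4
buff[offset:offset + len(chunk)] = chunk    # copy the segment into place
print(repr(buff))                           # bytearray(b'\x00\x00\x00\x00ABC\x00\x00\x00')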
How do I implement something like the 'head' and 'tail' commands in Python, and read a text file backwards by lines?
This is my personal file class ;-)
class File(file):
    """ A helper class for file reading """

    def __init__(self, *args, **kwargs):
        super(File, self).__init__(*args, **kwargs)
        self.BLOCKSIZE = 4096

    def head(self, lines_2find=1):
        self.seek(0)  # Rewind file
        return [super(File, self).next() for x in xrange(lines_2find)]

    def tail(self, lines_2find=1):
        self.seek(0, 2)  # Go to end of file
        bytes_in_file = self.tell()
        lines_found, total_bytes_scanned = 0, 0
        while (lines_2find + 1 > lines_found and
               bytes_in_file > total_bytes_scanned):
            byte_block = min(
                self.BLOCKSIZE,
                bytes_in_file - total_bytes_scanned)
            self.seek(-(byte_block + total_bytes_scanned), 2)
            total_bytes_scanned += byte_block
            lines_found += self.read(self.BLOCKSIZE).count('\n')
        self.seek(-total_bytes_scanned, 2)
        line_list = list(self.readlines())
        return line_list[-lines_2find:]

    def backward(self):
        self.seek(0, 2)  # Go to end of file
        blocksize = self.BLOCKSIZE
        last_row = ''
        while self.tell() != 0:
            try:
                self.seek(-blocksize, 1)
            except IOError:
                blocksize = self.tell()
                self.seek(-blocksize, 1)
            block = self.read(blocksize)
            self.seek(-blocksize, 1)
            rows = block.split('\n')
            rows[-1] = rows[-1] + last_row
            while rows:
                last_row = rows.pop(-1)
                if rows and last_row:
                    yield last_row
        yield last_row
Example usage:
with File('file.name') as f:
    print f.head(5)
    print f.tail(5)
    for row in f.backward():
        print row
head is easy:
from itertools import islice

with open("file") as f:
    for line in islice(f, n):
        print line
tail is harder if you don't want to keep the whole file in memory. If the input is a file, you could start reading blocks beginning at the end of the file. The original tail also works if the input is a pipe, so a more general solution is to read and discard the whole input, except for the last few lines. An easy way to do this is collections.deque:
from collections import deque

with open("file") as f:
    for line in deque(f, maxlen=n):
        print line
In both these code snippets, n is the number of lines to print.
Tail:
import os

def tail(fname, lines):
    """Read last N lines from file fname."""
    f = open(fname, 'r')
    BUFSIZ = 1024
    f.seek(0, os.SEEK_END)
    fsize = f.tell()
    block = -1
    data = ""
    exit = False
    while not exit:
        step = (block * BUFSIZ)
        if abs(step) >= fsize:
            f.seek(0)
            exit = True
        else:
            f.seek(step, os.SEEK_END)
        data = f.read().strip()
        if data.count('\n') >= lines:
            break
        else:
            block -= 1
    return data.splitlines()[-lines:]
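Hypothetical usage, assuming a file named file.txt exists:
for line in tail("file.txt", 10):   # print the last 10 lines
    print(line)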