Counting instances in a file, less or equal to - python

I cannot seem to get this working. I have a txt file in which one column contains numbers up to 250000 and the other contains numbers from 0 to 4. I want to count how many times the first number is less than or equal to 50000 and its corresponding number is 0. I would then like to write the count to a file. For some reason, it doesn't recognize any instances where the number is less than or equal to 50000.
import sys
import argparse
import operator
# Count the input lines whose first field is <= 50000 and whose second field
# is 0, then write the tally followed by "cluster0" to the output file.
# NOTE(review): indentation was lost in this listing, so the exact nesting
# (e.g. whether the write happens inside or after the loop) cannot be
# recovered from the text alone.
def main (argv):
# argv is accepted but never read; argparse pulls from sys.argv itself.
parser = argparse.ArgumentParser(description='Get the variants that are present at least 5% of the time ')
parser.add_argument('infile', help='file to process')
parser.add_argument('outfile', help='file to produce')
args = parser.parse_args()
# results is never used below.
results =[]
# c0 is the integer 0 and a is the integer 50000.
c0 = int("0")
count = 0
a = int("50000")
with open(args.infile, "r") as f, open(args.outfile, "w") as of:
file_in = f.readlines()
for line in file_in:
# temp holds the whitespace-separated fields of the line, all as strings.
temp = line.split()
# NOTE(review): temp[0] and temp[1] are str while a and c0 are int, so both
# comparisons are str-vs-int and never behave numerically; each field needs
# int() before it is compared.
if temp[0]<= a and temp[1]== c0:
count+=1
first_trajectory_cluster0 = str(count)
of.write(first_trajectory_cluster0 + "cluster0" + "\n")
if __name__ == "__main__":
main(sys.argv)

You're comparing a string to an integer in each case:
temp = line.split()
if temp[0]<= a and temp[1]== c0:
You need to convert:
temp = line.split()
if int(temp[0]) <= a and int(temp[1]) == c0:

Related

How to check if two different strings from two different files are present in third file?

I want to check if two different strings which are from two different files are present in third file and if they are present then write that line to fourth file. The set of strings are IPv4 addresses. I am getting empty file even if the strings are present in both files. Also I want to implement multi-threading/multiprocessing to speed up the process if possible. Thank you so much for any suggestion/help in advance.
slave_list text file has entry as below:
hostA 192.168.15.32
hostB 192.168.15.33
hostC 192.168.15.37
static_ip_list text file has entry as below:
192.168.100.10
192.168.100.12
192.168.100.14
slave_logfile has entry as below:
1536043051.176 59320 192.168.100.10 TCP_MISS/200 21830 CONNECT www.google.com:443 - 192.168.15.32
Code:
from datetime import datetime, timedelta
import os
import string
import sys
slave_list = sys.argv[1]
static_ip_list = sys.argv[2]
append_log = open('/home/top10_domain_accessed/logs/append_logs.txt', 'a')
# For each of the previous 30 days, look up every slave's log file for that
# day and append to append_log the log lines that contain both the slave IP
# and one of the static IPs.
# NOTE(review): indentation was lost in this listing; the nesting described
# in these comments is the most plausible reading, not a certainty.
def file_path (slave_list):
# count is the number of days to go back from today (1..30).
count = 1
while(count <=30):
Nth_days = datetime.now() - timedelta(days=count)
date = Nth_days.strftime("%Y%m%d")
yr_month = Nth_days.strftime("%Y/%m")
# Log files are named local2.<YYYYMMDD>.
file_name = 'local2' + '.' + date
with open(slave_list) as file:
for line in file:
# NOTE(review): this local named `string` hides the imported `string` module.
string = line.split()
slave_name = string[0]
slave_ip = string[1]
log_path = "/LOGS/%s/%s" %(slave_name, yr_month)
slave_logfile = os.path.join(log_path, file_name)
if os.path.exists(slave_logfile):
# NOTE(review): log_read is never closed in the visible code.
log_read = open(slave_logfile, 'r')
# NOTE(review): this loop reuses the name `line` from the slave-list loop.
for line in log_read:
# Substring test: an IP such as 192.168.15.3 also matches 192.168.15.32.
if slave_ip in line:
# The static IP file is reopened and rescanned for every matching log line.
with open(static_ip_list) as ip_list:
for static_ip in ip_list:
# NOTE(review): a blank line in the file becomes '' here, and '' is
# contained in every string, so it would match every log line.
static_ip = static_ip.rstrip()
if static_ip in line:
# `line` is not stripped before writing, so if it already ends with a
# newline this adds a blank line after it.
append_log.write(line + '\n')
else:
pass
count = count + 1
if __name__ == '__main__':
file_path(slave_list)

passing files and values as parameter to a function in python

I am a Python newbie. I am trying to run this simple Python example. I wish to pass files and certain values as parameters to my function latcalc(). Could anyone suggest how I can pass my files and values as parameters? Or is there a better way/approach to do these things?
#!/usr/bin/python
# include the constants
# Bounds of the length loop in latcalc: range(min_length, max_length), so
# max_length itself is not visited.
min_length = 1
max_length = 30
# delays
delay = 100
# Speed of light
c_vaccum = 3e8
# NOTE(review): `global` at module level has no effect; these three
# statements do not create or share any variable.
global filename1
global filename2
global filename3
# Opens three output files for writing.
# NOTE(review): there is no `global` declaration inside this function, so
# the three assignments create locals that are discarded on return; the
# module-level names filename1..3 used further down are never defined.
def openfiles():
filename1 = open("file1.txt", "w")
filename2 = open("file2.txt", "w")
filename3 = open("file3.txt", "w")
# Writes one "<length>\t<running total>" row per length to `filename`, which
# is used as an open file object (it must provide .write()).
# NOTE(review): target_name is overwritten with 0 on the first line, so the
# value passed by the caller is ignored and nothing is handed back.
def latcalc(filename,target_name,vf):
target_name = 0
for length in range(min_length, max_length):
if length < 2:
target_name += (length/(vf * c_vaccum))
elif length == 2:
target_name += delay
else:
# Self-assignment: leaves the running total unchanged.
target_name = target_name
myline="%s\t%s\n" % (length, target_name)
filename.write(myline)
openfiles()
# NOTE(review): filename1..3 and lat40/lat80/lat100 are not defined anywhere
# in this listing.
latcalc(filename1,lat40,0.4)
latcalc(filename2,lat80,0.8)
latcalc(filename3,lat100,1)
I would create a little class (give it a useful name) to encapsulate your data.
If your files grow you only have to change your create_lats
# Bounds of the length loop in latcalc: range(min_length, max_length), so
# max_length itself is not visited.
min_length = 1
max_length = 30
# delays
delay = 100
# Speed of light
c_vaccum = 3e8


# Little class to keep our data in one place
class Lat:
    """Bundle an output file with the factor used to compute its rows.

    Attributes:
        filename: Path of the output file.
        factor: Multiplier applied to ``c_vaccum`` in the latency term.
        file: Writable handle opened on ``filename`` (truncates the file).

    The instance can be used as a context manager so the handle is closed
    even if writing raises; callers may also call ``close()`` or
    ``file.close()`` directly.
    """

    def __init__(self, filename, factor):
        self.filename = filename
        self.factor = factor
        self.file = open(filename, "w")  # let the class open the file

    def close(self):
        """Close the underlying file handle (safe to call more than once)."""
        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never swallow the caller's exception


# now our function needs only one parameter, neat!
def latcalc(lat):
    """Write one ``"<length>\\t<running total>"`` row per length to ``lat.file``.

    The running total gains ``length / (lat.factor * c_vaccum)`` while
    ``length < 2``, gains ``delay`` once at ``length == 2``, and is left
    unchanged for every larger length.

    Args:
        lat: Object exposing ``factor`` and a writable ``file``.
    """
    running_total = 0
    for length in range(min_length, max_length):
        if length < 2:
            running_total += length / (lat.factor * c_vaccum)
        elif length == 2:
            running_total += delay
        lat.file.write("%s\t%s\n" % (length, running_total))


def create_lats():
    """Open the three output files and return them as a list of ``Lat``.

    If one of the files cannot be opened, the handles opened so far are
    closed before the error is re-raised, so nothing is leaked.
    """
    specs = (("file1.txt", 0.4), ("file2.txt", 0.8), ("file3.txt", 1))
    lats = []
    try:
        for filename, factor in specs:
            lats.append(Lat(filename, factor))
    except Exception:
        for opened in lats:
            opened.close()
        raise
    return lats
#loop over your lats created in create_lats
for lat in create_lats():
latcalc(lat)
lat.file.close() #close the file
try something like this (notice the globals are gone):
def openfiles(namelist):
    """Open every path in ``namelist`` for writing and return the handles.

    Args:
        namelist: Iterable of file names to open in ``'w'`` mode.

    Returns:
        A list of open file objects, in the same order as ``namelist``.
        The caller is responsible for closing them.

    Raises:
        Whatever ``open`` raises; handles opened before the failure are
        closed first so they are not leaked.
    """
    handles = []
    try:
        # Iterate the parameter, not a same-purpose global that happens to exist.
        for name in namelist:
            handles.append(open(name, 'w'))
    except Exception:
        for handle in handles:
            handle.close()
        raise
    return handles
filelist = ['file1.txt', 'file2.txt', 'file3.txt']
handles = openfiles(filelist)
for handle in handles:
<do what ever you want>
handles will be a list of file handles corresponding to the filelist of names
note the file handle is what you pass around to do reads & writes with
also the opens could be done in the call to latcalc, since you would be doing one file per call apparently
As some comments point out, you don't need global variables and you should close your filehandler objects after you finished writing to them which is most conveniently done with 'with' (closing is done for you, even in case of an unexpected exception):
#!/usr/bin/python
# Bounds of the length loop in latcalc: range(min_length, max_length), so
# max_length itself is not visited.
min_length = 1
max_length = 3
delay = 100
c_vaccum = 3e8


def latcalc(filename, vf):
    """Write one row per length to ``filename`` and return the final total.

    The running total gains ``length / (vf * c_vaccum)`` while
    ``length < 2`` and gains ``delay`` once at ``length == 2``.

    Args:
        filename: Path of the file to (over)write.
        vf: Factor multiplied with ``c_vaccum`` in the latency term.

    Returns:
        The running total after the last length.
    """
    target_name = 0
    # Open once, outside the loop: reopening in "w" mode per row (or writing
    # only after the loop) would leave just the last row in the file.
    with open(filename, "w") as f:
        for length in range(min_length, max_length):
            if length < 2:
                target_name += length / (vf * c_vaccum)
            elif length == 2:
                target_name += delay
            # NOTE: %d truncates the total to an integer in the file; the
            # untruncated value is what gets returned.
            f.write("%s\t%d\n" % (length, target_name))
    return target_name
# latcalc takes (filename, vf) and returns the final total, so pass the
# file name as a string and keep the returned value instead of passing an
# undefined name as an output argument.
lat40 = latcalc("file1.txt", 0.4)
lat80 = latcalc("file2.txt", 0.8)
lat100 = latcalc("file3.txt", 1)
The way you treat the parameter target_name, I assume, you are used to C-type pointers which do not exist in that form in Python. The parameter is pointless here if you set it to a new value in the first line of latcalc(). Also, you seem to treat target_name as a string when it is an int:
myline="%s\t%s\n" % (length, target_name)
If you need target_name after the method has finished, you would have to return it.
1) open() gives you a filehandler, and not a filename
2) Use a "with" statement for opening a file, to avoid "forgetting" closing the file when finished.
#!/usr/bin/python
# include the constants
# Bounds of the length loop in latcalc: range(min_length, max_length), so
# max_length itself is not visited.
min_length = 1
max_length = 30
# delays
delay = 100
# Speed of light
c_vaccum = 3e8


def latcalc(filename, target_name, vf):
    """Write one ``"<length>\\t<running total>"`` row per length to ``filename``.

    The running total starts at 0, gains ``length / (vf * c_vaccum)`` while
    ``length < 2``, gains ``delay`` once at ``length == 2``, and is left
    unchanged for every larger length.

    Args:
        filename: Path of the file to (over)write.
        target_name: Ignored; the total always starts from 0. Kept only so
            existing three-argument calls continue to work.
        vf: Factor multiplied with ``c_vaccum`` in the latency term.
    """
    running_total = 0
    with open(filename, "w") as opened_file:
        for length in range(min_length, max_length):
            if length < 2:
                running_total += length / (vf * c_vaccum)
            elif length == 2:
                running_total += delay
            opened_file.write("%s\t%s\n" % (length, running_total))
latcalc("file1.txt", "lat40", 0.4)
latcalc("file2.txt", "lat80", 0.8)
latcalc("file3.txt", "lat100", 1)

search pattern in sequence and report identity

I have 2 FASTA files with sequences in them. I want to align the sequences in the second file to the first file and report the identity.
For example:
File1:
>s1
aaccggactggacatccg
>s2
gtcgactctcggaattg
....
File2:
>a1
actg
>a2
tccg
.....
I want to take the file2 sequences and look in file1 and print the matching with mismatched base in uppercase and identity in csv format
Output
name,a1_alignment,a1_identity,a2_alignment,a2_identity
s1,actg,100,tccg,100
s2,aCtg,95,tcCg,95
Here what I did so far:
import sys
import os,csv
from Bio import SeqIO
from itertools import *
from optparse import OptionParser
parser = OptionParser()
parser.add_option("-m", "--mismatch_threshold", dest="mismatch_threshold", default = 2,
help="This is the number of differences you'll allow between the actualread and your sequence of interest. Default is 2")
(options, args) = parser.parse_args()
if len(sys.argv) != 4:
print "Usage : python search.py <file1> <file2> <fout>"
sys.exit()
f1 = open(sys.argv[1],'r')
f2 = open(sys.argv[2],'r')
fout = open(sys.argv[3],'w')
writer = csv.writer(fout)
# NOTE(review): indentation was lost in this listing, so the nesting of the
# loops and branches below cannot be recovered with certainty.
# Generator yielding [name, sequence] for each FASTA record read from f1.
# NOTE(review): the name `long` hides the Python 2 builtin of the same name.
def long(f1):
for record in SeqIO.parse(f1,'fasta'):
header = record.name
sequence = record.seq
yield [header, sequence]
# Returns a single sequence from f2 rather than all of them; whether that is
# the first or the last record depends on the lost indentation of `return`.
# `head` is a local and is not visible outside this function.
def short(f2):
for record in SeqIO.parse(f2,'fasta'):
head = record.name
seq = record.seq
return seq
# Intended to compare `sequence` with `seq` and yield an identity percentage.
# Because the body contains `yield`, this is a generator: nothing runs until
# the result is iterated.
def alignment(sequence,seq,mismatch_threshold):
# NOTE(review): the caller below passes the generator returned by long() as
# `sequence`, which is what the quoted TypeError (no len()) reports.
l1 = len(sequence)
l2 = len(seq)
# This local list hides the function's own name inside the body.
alignment = []
for i in range(0,min(l1,l2)):
if sequence[i] == seq[i]:
alignment.append(i)
else:
mismatch = sum( c1 != c2 for c1,c2 in zip(sequence,seq))
if mismatch <= mismatch_threshold:
alignment.append(i)
k = 0
l = 0
# NOTE(review): `alignment` holds integer positions, so `read` is an int and
# iterating over it would fail.
for read in alignment:
for letter in read:
# NOTE(review): `isupper` is not defined as a free function in this listing.
if letter == isupper():
pass
else:
# NOTE(review): `j` is not defined anywhere in this listing.
if letter == alignment[0].seq[j]:
l +=1
k += 1
k = 0
length = seq
percent = 100*l/len(seq)
#print percent
yield percent
longsequences = long(open(sys.argv[1],'r'))
shortsequences = short(open(sys.argv[2],'r'))
align = alignment(longsequences,shortsequences,options.mismatch_threshold)
# NOTE(review): `head` is only assigned inside short(), so it is undefined here.
for name in head:
writer.writerow(( name +'_alignment' , name + '_identity'))
# NOTE(review): this loop has no body in the listing.
for s in align:
# print to csv file
I need help in looking the file2 sequences in file1 with mismatches and print the alignment and also in calculating the identity percentage
Error:
File "s.py", line 34, in alignment
l1 = len(sequence)
TypeError: object of type 'generator' has no len()

Cutting character values according to value from file

This is what I am doing:
import csv
# Script: pair each header line of input.txt with the data line after it,
# truncate the data to a length looked up per header, and write the
# truncated data to output.txt.
# NOTE(review): indentation was lost in this listing.
# NOTE(review): opened in binary mode ('wb') but written with str values below.
output = open('output.txt' , 'wb')
# this functions return the min for num.txt
# Opens "<num>.txt" and converts its entire first line with int(), so that
# line must contain nothing but an integer. The handle is never closed.
def get_min(num):
return int(open('%s.txt' % num, 'r+').readlines()[0])
# temporary variables
last_line = ''
input_list = []
#iterate over input.txt in sort the input in a list of tuples
# Even-indexed lines are remembered as headers; each odd-indexed line is
# stored together with the header that preceded it.
for i, line in enumerate(open('input.txt', 'r+').readlines()):
if i%2 == 0:
last_line = line
else:
input_list.append((last_line, line))
# header[-2] is the second-to-last character of the header line; presumably
# the last visible character when the line still ends with its newline
# (TODO confirm). That single character is used as the file name stem.
filtered = [(header, data[:get_min(header[-2])] + '\n' ) for (header, data) in input_list]
# ''.join over the (header, data) tuple writes the header followed by the
# truncated data. The list comprehension is used only for its side effect.
[output.write(''.join(data)) for data in filtered]
output.close()
In this code input.txt is something like this
>012|013|0|3|M
AFDSFASDFASDFA
>005|5|67|0|6
ACCTCTGACC
>029|032|4|5|S
GGCAGGGAGCAGGCCTGTA
and num.txt is something like this
M 4
P 10
For each header in the above input.txt, I want to look at its last column, find the value listed for that same key in num.txt, and cut the sequence's characters according to that value.
I think the error in my code is that it only accepts a text file containing an integer, whereas it should also accept a file that contains letters.
The totally revised version, after a long chat with the OP;
import os
import re
# Fetch all hashes and counts: every "<name>.txt <digits>" pair found in
# num.txt becomes numbers[<name>] = <digits> (kept as a string until used).
with open('num.txt') as num_file:
    num_text = num_file.read()

numbers = {}
for entry in re.findall(r'\w+\.txt \d+', num_text):
    hash_name, count = entry.split('.txt ')
    numbers[hash_name] = count
#print(numbers)

# The input file
with open('input.txt') as input_file:
    file_i = input_file.read()

# .items() works on both Python 2 and 3 (iteritems is Python 2 only).
for hash_name, count in numbers.items():
    # Groups 1 and 2 both capture the hash name (doubled parentheses);
    # group 3 is the text between the name and the next '>'.
    pattern = r'>.*\|((' + hash_name.strip() + r'))(.*?)>'
    result = re.findall(pattern, file_i, re.S)
    # Skip empty captures: str.replace('') would insert text between every
    # character of the file.
    if result and result[0][2]:
        data_original = result[0][2]
        # Drop the first `count` characters of the captured text.
        stripped_data = data_original[int(count):]
        file_i = file_i.replace(data_original, '\n' + stripped_data)
        #print(data_original)
        #print(stripped_data)
#print(file_i)

# Write the input file to new input_new.txt; the with-block guarantees the
# data is flushed and the handle closed.
with open('input_new.txt', 'wt') as f:
    f.write(file_i)
You can do it like so;
import re
min_count = 4 # this variable will contain that count integer from where to start removing
str_to_match = 'EOG6CC67M' # this variable will contain the filename you read
# NOTE: this module-level name hides the builtin input().
input = '' # The file input (input.txt) will go in here
counter = 0


def callback_f(e):
    """Replacement callback for re.sub: keep the first ``min_count`` matches.

    Increments the module-level ``counter`` on every call and prints the
    match for inspection.

    Args:
        e: The match object supplied by ``re.sub``.

    Returns:
        ``''`` once more than ``min_count`` matches have been seen (removing
        the match), otherwise the matched text unchanged.
    """
    global counter
    counter += 1
    # Check your input
    print(str(counter) + ' >>> ' + e.group())
    # Only replace the value with nothing (remove it) after a certain count
    if counter > min_count:
        return '' # replace with nothing
    # Return the match itself: falling through would return None, which
    # re.sub treats as an empty replacement and would delete these too.
    return e.group()


# str_to_match is used as a regular expression as-is (not escaped).
result = re.sub(r'' + str_to_match, callback_f, input)
With this tactic you can keep count with a global counter and there's no need to do hard line-loops with complex structures.
Update
More detailed version with file access;
import os
import re
# NOTE(review): indentation was lost in this listing, so the nesting of the
# loops below is the most plausible reading, not a certainty.
# Replacement callback for re.sub: counts matches in the module-level
# `counter` and prints each one.
# NOTE(review): there is no return statement, so it returns None for every
# match; it also never consults min_count. The replacement policy appears to
# be left for the reader to fill in.
def callback_f(e):
global counter
counter += 1
# Check your input
print(str(counter) + ' >>> ' + e.group())
# Fetch all hash-file names and their content (count)
num_files = os.listdir('./num_files')
numbers = {}
# Entries whose name starts with '.' are skipped.
for file in num_files:
if file[0] != '.':
# file_c is first the handle, then rebound to the file's text; the handle
# is never explicitly closed.
file_c = open('./num_files/' + file)
file_c = file_c.read()
# Key: file name up to the first '.'; value: the raw file content as a
# string (not converted to int).
numbers[file.split('.')[0]] = file_c
# Now the CSV files
csv_files = os.listdir('./csv_files')
for file in csv_files:
if file[0] != '.':
# NOTE(review): dict.iteritems() exists only on Python 2.
for hash_name, min_count in numbers.iteritems():
# The same CSV file is reopened and reread for every hash name.
file_c = open('./csv_files/' + file)
file_c = file_c.read()
counter = 0
# hash_name is used as a regular expression as-is (not escaped).
result = re.sub(r''+hash_name, callback_f, file_c)
# Write the replaced content back to the file here
Considered directory/file structure;
+ Projects
+ Project_folder
+ csv_files
- input1.csv
- input2.csv
~ etc.
+ num_files
- EOG6CC67M.txt
- EOG62JQZP.txt
~ etc.
- python_file.py
The CSV files contain the big chunks of text you state in your original question.
The Num files contain the hash-files with an Integer in them
What happens in this script;
Collect all Hash files (in a dictionary) and it's inner count number
Loop through all CSV files
Subloop through the collected numbers for each CSV file
Replace/remove (based on what you do in callback_f()) hashes after a certain count
Write the output back (it's the last comment in the script, would contain the file.write() functionality)

Searching a file for matches between two values and outputting search hits in Python

I am (attempting) to write a program that searches through a hex file for instances of a hex string between two values, eg. Between D4135B and D414AC, incrementing between the first value until the second is reached- D4135B, D4135C, D4135D etc etc.
I have managed to get it to increment etc, but it’s the search part I am having trouble with.
This is the code I have so far, it's been cobbled together from other places and I need to make it somehow output all search hits into the output file (file_out)
I have exceeded the limit of my Python understanding and I'm sure there's probably a much easier way of doing this. I would be very grateful for any help.
# NOTE(review): this listing is not valid Python as written: it contains
# capitalised keywords (If / Else / Return / Print), typographic quotes and
# an empty subscript, and its indentation was lost. The comments below
# describe only what the text shows.
# Looks for threeByteHexPlusOne / threeByteHex2 inside hx and toggles the
# module-level FLAG. threeByteHexPlusOne and threeByteHex2 are not defined
# anywhere in this listing, and nothing in the loop changes them.
def search_process(hx): # searching for two binary strings
global FLAG
while threeByteHexPlusOne != threeByteHex2: #Keep incrementing until second value reached
# NOTE(review): `Flag` here differs in case from the global `FLAG`.
If Flag:
if hx.find(threeByteHex2) != -1:
FLAG = False #If threeByteHex = ThreeByteHexPlusOne, end search
Print (“Reached the end of the search”,hx.find(threeByteHexPlusOne))
Else:
If hx.find(threeByteHexPlusOne) != -1:
FLAG = True
Return -1 #If no results found
if __name__ == '__main__':
try:
# FILE_IN and FILE_OUT are not defined in this listing.
file_in = open(FILE_IN, "r") #opening input file
file_out = open(FILE_OUT, 'w') #opening output file
# NOTE(review): `file_in.read` has no parentheses, so this binds the method
# itself rather than the file's contents.
hx_read = file_in.read #read from input file
tmp = ''
found = ''
while hx_read: #reading from file till file is empty
hx_read = tmp + hx_read
pos = search_process(hx_read)
while pos != -1:
# NOTE(review): assigns `hex_read`, but every other line uses `hx_read`.
hex_read = hx_read[pos:]
if FLAG:
found = found + hx_read
pos = search_process(hx_read)
# NOTE(review): `bytes_read` is undefined and `[]` is an empty subscript.
tmp = bytes_read[]
hx_read = file_in.read
file_out.write(found) #writing to output file
except IOError:
print('FILE NOT FOUND!!! Check your filename or directory/PATH')
Here's a program that looks through a hex string from a file 3 bytes at a time and if the 3-byte hex string is between the given hex bounds, it writes it to another file. It makes use of generators to make getting the bytes from the hex string a little cleaner.
import base64
import sys
_usage_string = 'Usage: python {} <input_file> <output_file>'.format(sys.argv[0])
def _to_base_10_int(value):
return int(value, 16)
def get_bytes(hex_str):
# Two characters equals one byte
for i in range(0, len(hex_str), 2):
yield hex_str[i:i+2]
def get_three_byte_hexes(hex_str):
bytes = get_bytes(hex_str)
while True:
try:
three_byte_hex = next(bytes) + next(bytes) + next(bytes)
except StopIteration:
break
yield three_byte_hex
def find_hexes_in_range(hex_str, lower_bound_hex, upper_bound_hex):
lower_bound = _to_base_10_int(lower_bound_hex)
upper_bound = _to_base_10_int(upper_bound_hex)
found = []
for three_byte_hex in get_three_byte_hexes(hex_str):
hex_value = _to_base_10_int(three_byte_hex)
if lower_bound <= hex_value < upper_bound:
found.append(three_byte_hex)
return found
if __name__ == "__main__":
    import binascii

    # Validate explicitly: assert statements are stripped under -O.
    if len(sys.argv) != 3:
        print(_usage_string)
        sys.exit(2)

    with open(sys.argv[1], 'rb') as fin:
        file_contents = fin.read()

    # The input file is treated as base64 text: decode it to raw bytes, then
    # render those bytes as a hex-digit string. b64decode/hexlify replace
    # decodestring/.encode('hex'), which do not exist on current Python 3.
    hex_str = binascii.hexlify(base64.b64decode(file_contents)).decode('ascii')

    found = find_hexes_in_range(hex_str, 'D4135B', 'D414AC')
    print('Found:')
    print(found)
    if found:
        # Text mode, because the hits are str; they are written back to back
        # with no separator.
        with open(sys.argv[2], 'w') as fout:
            for _hex in found:
                fout.write(_hex)
Check out some more info on generators here

Categories