I have two FASTA files with sequences in them. I want to align the sequences in the second file to the first file and report the identity.
For example:
File1:
>s1
aaccggactggacatccg
>s2
gtcgactctcggaattg
....
File2:
>a1
actg
>a2
tccg
.....
I want to take the file2 sequences, look for them in file1, and print the matches with mismatched bases in uppercase, along with the identity, in CSV format.
Output
name,a1_alignment,a1_identity,a2_alignment,a2_identity
s1,actg,100,tccg,100
s2,aCtg,95,tcCg,95
Here is what I did so far:
import sys
import os, csv
from Bio import SeqIO
from itertools import *
from optparse import OptionParser

parser = OptionParser()
parser.add_option("-m", "--mismatch_threshold", dest="mismatch_threshold", default=2,
                  help="This is the number of differences you'll allow between the actual read and your sequence of interest. Default is 2")
(options, args) = parser.parse_args()

if len(sys.argv) != 4:
    print "Usage : python search.py <file1> <file2> <fout>"
    sys.exit()

f1 = open(sys.argv[1], 'r')
f2 = open(sys.argv[2], 'r')
fout = open(sys.argv[3], 'w')
writer = csv.writer(fout)

def long(f1):
    for record in SeqIO.parse(f1, 'fasta'):
        header = record.name
        sequence = record.seq
        yield [header, sequence]

def short(f2):
    for record in SeqIO.parse(f2, 'fasta'):
        head = record.name
        seq = record.seq
        return seq

def alignment(sequence, seq, mismatch_threshold):
    l1 = len(sequence)
    l2 = len(seq)
    alignment = []
    for i in range(0, min(l1, l2)):
        if sequence[i] == seq[i]:
            alignment.append(i)
        else:
            mismatch = sum(c1 != c2 for c1, c2 in zip(sequence, seq))
            if mismatch <= mismatch_threshold:
                alignment.append(i)
    k = 0
    l = 0
    for read in alignment:
        for letter in read:
            if letter == isupper():
                pass
            else:
                if letter == alignment[0].seq[j]:
                    l += 1
                k += 1
        k = 0
        length = seq
        percent = 100 * l / len(seq)
        # print percent
        yield percent

longsequences = long(open(sys.argv[1], 'r'))
shortsequences = short(open(sys.argv[2], 'r'))
align = alignment(longsequences, shortsequences, options.mismatch_threshold)

for name in head:
    writer.writerow((name + '_alignment', name + '_identity'))
for s in align:
    # print to csv file
I need help with looking up the file2 sequences in file1 with mismatches allowed, printing the alignment, and calculating the identity percentage.
Error:
File "s.py", line 34, in alignment
l1 = len(sequence)
TypeError: object of type 'generator' has no len()
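The TypeError occurs because long() is a generator function, so alignment() receives a generator rather than a single sequence and len() fails; each record has to be iterated over before its sequence can be compared. For the search itself, here is a minimal sketch of one approach (a sliding-window mismatch count; the best_match helper, the file names, and the threshold default are illustrative, not part of the original script):

from Bio import SeqIO

def best_match(long_seq, short_seq, mismatch_threshold=2):
    # Slide short_seq along long_seq and keep the window with the
    # fewest mismatches (a simple Hamming comparison per window).
    best = None
    for i in range(len(long_seq) - len(short_seq) + 1):
        window = long_seq[i:i + len(short_seq)]
        mismatches = sum(a != b for a, b in zip(window, short_seq))
        if best is None or mismatches < best[0]:
            best = (mismatches, window)
    if best is None or best[0] > mismatch_threshold:
        return None
    mismatches, window = best
    # Show the short sequence with mismatched bases in uppercase,
    # as in the desired CSV output.
    aligned = ''.join(s.upper() if w != s else s.lower()
                      for w, s in zip(window, short_seq))
    identity = 100.0 * (len(short_seq) - mismatches) / len(short_seq)
    return aligned, identity

# 'file1.fasta' / 'file2.fasta' stand in for sys.argv[1] / sys.argv[2]
longs = {r.name: str(r.seq) for r in SeqIO.parse('file1.fasta', 'fasta')}
shorts = {r.name: str(r.seq) for r in SeqIO.parse('file2.fasta', 'fasta')}
for name, seq in longs.items():
    print(name, {n: best_match(seq, s) for n, s in shorts.items()})

From there, writing one CSV row per file1 record would follow the header layout shown in the desired output above.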
Related
I am trying to save this dict into a .csv file, with one column for the key and one column for the counted frequency. But when I try to run the code below, I get an error at the end, for line 6. What am I doing wrong?
encryptedA_edited = abcfreq(encryptedA)
import csv
with open("EncryptA_Edited.csv", "w", newline="") as ECA:
writer = csv.writer(ECA)
writer.writerows(encryptedA_edited.items())
'NoneType' object has no attribute 'items'
This is my function:
import string

encryptedA = open("encryptedA.txt")
encryptedA = encryptedA.read()
encryptedB = open("encryptedB.txt")
encryptedB = encryptedB.read()

def abcfreq(lettervalues):
    lettervalues = lettervalues.lower().strip()
    freq = {}
    j = 0
    for x in string.ascii_lowercase:
        freq[x] = 0
    for j in lettervalues:
        if j in freq:
            freq[j] += 1
    for key, value in freq.items():
        print("%s:%d" % (key, value))
    max_value = max(freq.values())
    max_key = max(freq, key=freq.get)
    print("The letter with the highest frequency is " + max_key + ":" + str(max_value))
This should be what you're looking for. I added a few functions for convenience, and the script shows how to view a file once it has been created.
from random import choice, randint
import csv, os, string

def freq(element, iterable) -> int:
    return sum(1 for i in iterable if i == element)

def sample(sequence, length):
    '''
    Just for "generating" test strings;
    breaks if the sequence does not have a __len__ attribute
    '''
    for i in range(length):
        yield choice(sequence)

def ascii(omissions: str = '', include: bool = False) -> str:
    """
    A convenient ascii character set.
    Return an ascii character set excluding the given omissions:
        "p" -> ' ' + punctuation
        "u" -> uppercase
        "l" -> lowercase
        "d" -> digits
    Feel free to omit combinations:
        >>> ascii('lup')
        ... 0123456789
    or include them:
        >>> ascii('d', True)
        ... 0123456789
    """
    d = {
        "p": " " + string.punctuation,
        "u": string.ascii_uppercase,
        "l": string.ascii_lowercase,
        "d": string.digits,
    }
    return "".join(d[k] for k in d if k in omissions) if include else "".join(d[k] for k in d if k not in omissions)

def weights(string, omissions='', include=False):
    # return {i: freq(i, string) for i in set(string)}  # if you only want to measure elements of the string
    return {i: freq(i, string) for i in ascii(omissions, include)}

heaviest = lambda string: max(string, key=weights(string).get)

if __name__ == '__main__':
    s = 'abcda'
    print(s, heaviest(s), sep='\n\t')  # 'a'
    strings = [''.join(sample(ascii('l', True)[:4], randint(3, 5))) for i in range(4)]
    for s in strings:
        path = s + '.csv'
        with open(path, "w", newline="") as ECA:
            writer = csv.writer(ECA)
            writer.writerows(weights(s, 'l', True).items())
        os.startfile(path)
I am trying to check for the nrf2 binding motif using regular expressions with Python. I had done this in R using the JASPAR2018 PWM, but ran into a few issues with JASPAR, so I wish to redo it using Python.
Attempt
import re    # used below but missing from the original imports
import gzip  # used below but missing from the original imports
from Bio import SeqIO
from itertools import islice
import pandas as pd
import twobitreader as tbr  # assumption: tbr refers to the twobitreader package

# Creating reverse complements
def reverseComp(Seq):
    seq = Seq.upper()
    d = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    try:
        seq = seq[::-1]
        rc_seq = "".join([d[nuc] for nuc in seq])
    except KeyError:
        return "Not Viable DNA Seq"
    return rc_seq

def genSeq(genome_path, chrom, chromstart, chromend):
    if bool(re.search('gz', genome_path)) | bool(re.search('fa', genome_path)) | bool(re.search('fasta', genome_path)):
        if bool(re.search('gz', genome_path)) == True:
            genome = SeqIO.parse(gzip.open(genome_path, 'rt'), 'fasta')
            identifiers = [seq_record.id for seq_record in genome]
            seq_gen = next(islice(genome, identifiers.index(chrom), None))
            seq = str(seq_gen.seq[chromstart:chromend])
        else:
            genome = SeqIO.parse(open(genome_path), 'fasta')
            identifiers = [seq_record.id for seq_record in genome]
            seq_gen = next(islice(genome, identifiers.index(chrom) + 1, None))
            seq = str(seq_gen.seq[chromstart:chromend])
    elif bool(re.search('2bit', genome_path)):
        tbGenome = tbr.TwoBitFile(genome_path)
        seq = tbGenome[chrom][chromstart:chromend]
    else:
        raise Exception('File type not recognized')
    return (seq).upper()

pat = "[AGC]TGA[CTG][ATCG][CAT][AGT]GC[ATCG]"
pattern = re.compile(pat)
motifDF = []
motifQuant = []
with open('/Users/kalyanidhusia/Desktop/nrf2_R/ENCFF126HBJ.bed') as f:
    for line in f:
        peak = list(line.split())
        seq = genSeq('hg19.fa', peak[0], int(peak[1]), int(peak[2]))
        rSeq = reverseComp(seq)
        sequences = []
        for result in re.finditer(pattern, seq):
            sequences.append("".join(result.groups()))
        for result in re.finditer(pattern, rSeq):
            sequences.append("".join(result.groups()))
        if len(sequences) > 0:
            seqs = pd.DataFrame({'binding': sequences, 'chrom': peak[0], 'chromstart': peak[1], 'chromend': peak[2]})
            motifDF.append(seqs)
            motifQuant.append([peak[0], peak[1], peak[2], len(seqs), len(seq)])

search_reg = pd.concat(motifDF)
names = ['chrom', 'chromstart', 'chromend', 'numOfMatches', 'lenSeq']
dist_reg = pd.DataFrame(motifQuant, columns=names)
Error
This is the error I am getting:
ipython-input-3-2e7ebdf92205> in genSeq(genome_path, chrom, chromstart, chromend)
     25     identifiers = [seq_record.id for seq_record in genome]
---> 26     seq_gen = next(islice(genome, identifiers.index(chrom)+1, None))
     27     seq = str(seq_gen.seq[chromstart:chromend])
     28 elif bool(re.search('2bit', genome_path)):

StopIteration:
How do I solve this problem?
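For context: SeqIO.parse returns a one-shot generator, so the list comprehension that builds identifiers consumes every record, and the later next(islice(genome, ...)) finds nothing and raises StopIteration. One way around this (a sketch, not the poster's eventual solution below) is to index the FASTA once and fetch records by ID:

from Bio import SeqIO

# SeqIO.index builds a lazy, dictionary-like lookup over the FASTA,
# so records can be fetched by ID without exhausting a generator.
genome = SeqIO.index('hg19.fa', 'fasta')   # file name taken from the question
record = genome['chr1']                    # 'chr1' is an illustrative chromosome ID
seq = str(record.seq[100000:100100]).upper()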
I was able to solve the above problem by tweaking my code a little. Here is the solved example, along with my remaining problem with the code below:
motif = '[REGULAR_EXPRESSION_FOR_YOUR_MOTIF]'
regBS = re.compile(motif)
motifDF = []
motifQuant = []
genome = tbr.TwoBitFile('/Path_to_your_genomefile_in_2bit.2bit/')
with open('/Path_to_your.bedfile/') as f:
    for line in f:
        if line.startswith('track') == False:
            peak = list(line.split())
            seq = (genome[peak[0]][int(peak[1]):int(peak[2])]).upper()
            rSeq = reverseComp(seq)
            sequences = []
            sequences.extend(re.findall(regBS, seq))
            sequences.extend(re.findall(regBS, rSeq))
            if len(sequences) > 0:
                seqs = pd.DataFrame({'binding': sequences, 'chrom': peak[0], 'chromstart': peak[1], 'chromend': peak[2], 'NR': 'NRF2'})
                motifDF.append(seqs)
                motifQuant.append([peak[0], peak[1], peak[2], len(seqs), len(seq)])

search_reg = pd.concat(motifDF)
names = ['chrom', 'chromstart', 'chromend', 'numOfMatches', 'lenSeq']
dist_reg = pd.DataFrame(motifQuant, columns=names)
dist_reg.head()

n = 5
x = [len(i[6+n:-6-n]) for i in search_reg['binding']]
This code generates the peak sequences that I want and stores them in search_reg['binding'], but it also stores a space-separated number with each one. I need to store them in two different columns. Any suggestions?
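If each 'binding' entry really is a sequence followed by a space and a number, one sketch for splitting it into two columns with pandas (the new column names here are illustrative):

# Split each 'binding' entry on whitespace into two new columns.
# Assumes every entry looks like "SEQUENCE 123"; rename as needed.
search_reg[['sequence', 'score']] = search_reg['binding'].str.split(expand=True)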
I cannot seem to get this working. I have a txt file where one column contains numbers up to 250000 and another contains numbers from 0 to 4. I want to count how many instances there are where the first number is less than or equal to 50000 and the corresponding second number is 0, and then write the count to a file. For some reason, it doesn't recognize any instances where the number is less than or equal to 50000.
import sys
import argparse
import operator

def main(argv):
    parser = argparse.ArgumentParser(description='Get the variants that are present at least 5% of the time ')
    parser.add_argument('infile', help='file to process')
    parser.add_argument('outfile', help='file to produce')
    args = parser.parse_args()

    results = []
    c0 = int("0")
    count = 0
    a = int("50000")
    with open(args.infile, "r") as f, open(args.outfile, "w") as of:
        file_in = f.readlines()
        for line in file_in:
            temp = line.split()
            if temp[0] <= a and temp[1] == c0:
                count += 1
        first_trajectory_cluster0 = str(count)
        of.write(first_trajectory_cluster0 + "cluster0" + "\n")

if __name__ == "__main__":
    main(sys.argv)
You're comparing a string to an integer in each case:
temp = line.split()
if temp[0]<= a and temp[1]== c0:
You need to convert:
temp = line.split()
if int(temp[0]) <= a and int(temp[1]) == c0:
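Dropping the conversion into the question's main(), the counting loop might look like this (a sketch reusing args from the existing code; the two-column layout is assumed from the question):

count = 0
with open(args.infile, "r") as f:
    for line in f:
        temp = line.split()
        # Compare numbers, not strings: as strings, "9" > "50000" lexicographically.
        if int(temp[0]) <= 50000 and int(temp[1]) == 0:
            count += 1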
This is what I am doing:
import csv

output = open('output.txt', 'wb')

# this function returns the min for num.txt
def get_min(num):
    return int(open('%s.txt' % num, 'r+').readlines()[0])

# temporary variables
last_line = ''
input_list = []

# iterate over input.txt and sort the input into a list of tuples
for i, line in enumerate(open('input.txt', 'r+').readlines()):
    if i % 2 == 0:
        last_line = line
    else:
        input_list.append((last_line, line))

filtered = [(header, data[:get_min(header[-2])] + '\n') for (header, data) in input_list]
[output.write(''.join(data)) for data in filtered]
output.close()
In this code, input.txt is something like this:
>012|013|0|3|M
AFDSFASDFASDFA
>005|5|67|0|6
ACCTCTGACC
>029|032|4|5|S
GGCAGGGAGCAGGCCTGTA
and num.txt is something like this:
M 4
P 10
I want the code to check, for each record in input.txt, the value from num.txt whose key matches the record's last column (e.g. M), and cut the record's characters according to that value.
I think the error in my code is that it only accepts an integer text file, where it should also accept a file that contains alphabetic keys.
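Before the answer below, here is a minimal sketch of the lookup as the question describes it (assuming headers like >012|013|0|3|M whose last |-field keys into num.txt lines like "M 4"; file names taken from the question):

# Build a {key: limit} table from num.txt, then trim each record's
# sequence line to the limit keyed by the last |-field of its header.
limits = {}
with open('num.txt') as nf:
    for line in nf:
        key, value = line.split()
        limits[key] = int(value)

with open('input.txt') as inf, open('output.txt', 'w') as out:
    for header in inf:
        data = next(inf)                       # sequence line follows its header
        key = header.strip().split('|')[-1]
        if key in limits:
            data = data[:limits[key]] + '\n'   # keep only the first N characters
        out.write(header + data)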
The totally revised version, after a long chat with the OP:
import os
import re

# Fetch all hashes and counts
file_c = open('num.txt')
file_c = file_c.read()

lines = re.findall(r'\w+\.txt \d+', file_c)

numbers = {}
for line in lines:
    line_split = line.split('.txt ')
    hash_name = line_split[0]
    count = line_split[1]
    numbers[hash_name] = count
# print(numbers)

# The input file
file_i = open('input.txt')
file_i = file_i.read()

for hash_name, count in numbers.iteritems():
    regex = '(' + hash_name.strip() + ')'
    result = re.findall(r'>.*\|(' + regex + ')(.*?)>', file_i, re.S)
    if len(result) > 0:
        data_original = result[0][2]
        stripped_data = result[0][2][int(count):]
        file_i = file_i.replace(data_original, '\n' + stripped_data)
        # print(data_original)
        # print(stripped_data)

# print(file_i)

# Write the input file to new input_new.txt
f = open('input_new.txt', 'wt')
f.write(file_i)
You can do it like so:
import re

min_count = 4               # this variable will contain that count integer from where to start removing
str_to_match = 'EOG6CC67M'  # this variable will contain the filename you read
input = ''                  # The file input (input.txt) will go in here
counter = 0

def callback_f(e):
    global min_count
    global counter
    counter += 1
    # Check your input
    print(str(counter) + ' >>> ' + e.group())
    # Only replace the value with nothing (remove it) after a certain count
    if counter > min_count:
        return ''  # replace with nothing
    return e.group()  # keep the match; re.sub's repl function must return a string

result = re.sub(r'' + str_to_match, callback_f, input)
With this tactic you can keep count with a global counter, and there's no need for hand-written line loops with complex structures.
Update
More detailed version with file access:
import os
import re

def callback_f(e):
    global counter
    counter += 1
    # Check your input
    print(str(counter) + ' >>> ' + e.group())
    return e.group()  # re.sub's repl function must return a string

# Fetch all hash-file names and their content (count)
num_files = os.listdir('./num_files')
numbers = {}
for file in num_files:
    if file[0] != '.':
        file_c = open('./num_files/' + file)
        file_c = file_c.read()
        numbers[file.split('.')[0]] = file_c

# Now the CSV files
csv_files = os.listdir('./csv_files')
for file in csv_files:
    if file[0] != '.':
        for hash_name, min_count in numbers.iteritems():
            file_c = open('./csv_files/' + file)
            file_c = file_c.read()
            counter = 0
            result = re.sub(r'' + hash_name, callback_f, file_c)
            # Write the replaced content back to the file here
Considered directory/file structure:
+ Projects
    + Project_folder
        + csv_files
            - input1.csv
            - input2.csv
            ~ etc.
        + num_files
            - EOG6CC67M.txt
            - EOG62JQZP.txt
            ~ etc.
        - python_file.py
The CSV files contain the big chunks of text you show in your original question.
The num files are the hash files, each containing an integer.
What happens in this script:
- Collect all hash files (in a dictionary) along with each one's inner count number
- Loop through all CSV files
- Subloop through the collected numbers for each CSV file
- Replace/remove (based on what you do in callback_f()) hashes after a certain count
- Write the output back (it's the last comment in the script; it would contain the file.write() functionality, as sketched below)
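As a sketch, that last write-back step could look like this (reusing the loop's file and result variables; the path layout is assumed from the structure above):

# Overwrite the CSV file with the substituted content.
with open('./csv_files/' + file, 'w') as out:
    out.write(result)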
I am attempting to write a program that searches through a hex file for instances of a hex string between two values, e.g. between D4135B and D414AC, incrementing from the first value until the second is reached: D4135B, D4135C, D4135D, etc.
I have managed to get it to increment, but it's the search part I am having trouble with.
This is the code I have so far. It's been cobbled together from other places, and I need to make it somehow output all search hits into the output file (file_out).
I have exceeded the limit of my Python understanding, and I'm sure there's probably a much easier way of doing this. I would be very grateful for any help.
def search_process(hx):  # searching for two binary strings
    global FLAG
    while threeByteHexPlusOne != threeByteHex2:  # keep incrementing until second value reached
        if FLAG:
            if hx.find(threeByteHex2) != -1:
                FLAG = False  # if threeByteHex = threeByteHexPlusOne, end search
                print("Reached the end of the search", hx.find(threeByteHexPlusOne))
        else:
            if hx.find(threeByteHexPlusOne) != -1:
                FLAG = True
    return -1  # if no results found

if __name__ == '__main__':
    try:
        file_in = open(FILE_IN, "r")    # opening input file
        file_out = open(FILE_OUT, 'w')  # opening output file
        hx_read = file_in.read          # read from input file
        tmp = ''
        found = ''
        while hx_read:  # reading from file till file is empty
            hx_read = tmp + hx_read
            pos = search_process(hx_read)
            while pos != -1:
                hex_read = hx_read[pos:]
                if FLAG:
                    found = found + hx_read
                pos = search_process(hx_read)
            tmp = bytes_read[]
            hx_read = file_in.read
        file_out.write(found)  # writing to output file
    except IOError:
        print('FILE NOT FOUND!!! Check your filename or directory/PATH')
Here's a program that looks through a hex string from a file 3 bytes at a time; if the 3-byte hex string is between the given hex bounds, it writes it to another file. It makes use of generators to make getting the bytes from the hex string a little cleaner.
import base64
import sys

_usage_string = 'Usage: python {} <input_file> <output_file>'.format(sys.argv[0])

def _to_base_10_int(value):
    return int(value, 16)

def get_bytes(hex_str):
    # Two characters equals one byte
    for i in range(0, len(hex_str), 2):
        yield hex_str[i:i+2]

def get_three_byte_hexes(hex_str):
    bytes = get_bytes(hex_str)
    while True:
        try:
            three_byte_hex = next(bytes) + next(bytes) + next(bytes)
        except StopIteration:
            break
        yield three_byte_hex

def find_hexes_in_range(hex_str, lower_bound_hex, upper_bound_hex):
    lower_bound = _to_base_10_int(lower_bound_hex)
    upper_bound = _to_base_10_int(upper_bound_hex)
    found = []
    for three_byte_hex in get_three_byte_hexes(hex_str):
        hex_value = _to_base_10_int(three_byte_hex)
        if lower_bound <= hex_value < upper_bound:
            found.append(three_byte_hex)
    return found

if __name__ == "__main__":
    try:
        assert(len(sys.argv) == 3)
    except AssertionError:
        print _usage_string
        sys.exit(2)

    file_contents = open(sys.argv[1], 'rb').read()
    hex_str = base64.decodestring(file_contents).encode('hex')
    found = find_hexes_in_range(hex_str, 'D4135B', 'D414AC')
    print('Found:')
    print(found)
    if found:
        with open(sys.argv[2], 'wb') as fout:
            for _hex in found:
                fout.write(_hex)
Check out some more info on generators here
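As a small illustration of the generator pattern used above: each next() call on get_bytes pulls one byte on demand, which is what lets get_three_byte_hexes group the stream in threes without slicing the string by hand.

def get_bytes(hex_str):
    # Yield two hex characters (one byte) at a time.
    for i in range(0, len(hex_str), 2):
        yield hex_str[i:i+2]

print(list(get_bytes('D4135B')))  # ['D4', '13', '5B']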