How to do motif search using Python?

I am trying to check for the NRF2 binding motif using a regular expression in Python. I had done this in R using the JASPAR2018 PWM, but due to a few issues with JASPAR I wish to redo it in Python.
Attempt
from Bio import SeqIO
from itertools import islice
import pandas as pd
import re                      # needed for the re.search / re.compile calls below
import gzip                    # needed for gzipped FASTA files
import twobitreader as tbr     # assumed: tbr.TwoBitFile is used in the 2bit branch

# Creating reverse complements
def reverseComp(Seq):
    seq = Seq.upper()
    d = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    try:
        seq = seq[::-1]
        rc_seq = "".join([d[nuc] for nuc in seq])
    except KeyError:
        return "Not Viable DNA Seq"
    return rc_seq

def genSeq(genome_path, chrom, chromstart, chromend):
    if bool(re.search('gz', genome_path)) | bool(re.search('fa', genome_path)) | bool(re.search('fasta', genome_path)):
        if bool(re.search('gz', genome_path)) == True:
            genome = SeqIO.parse(gzip.open(genome_path, 'rt'), 'fasta')
            identifiers = [seq_record.id for seq_record in genome]
            seq_gen = next(islice(genome, identifiers.index(chrom), None))
            seq = str(seq_gen.seq[chromstart:chromend])
        else:
            genome = SeqIO.parse(open(genome_path), 'fasta')
            identifiers = [seq_record.id for seq_record in genome]
            seq_gen = next(islice(genome, identifiers.index(chrom) + 1, None))
            seq = str(seq_gen.seq[chromstart:chromend])
    elif bool(re.search('2bit', genome_path)):
        tbGenome = tbr.TwoBitFile(genome_path)
        seq = tbGenome[chrom][chromstart:chromend]
    else:
        raise Exception('File type not recognized')
    return (seq).upper()

pat = "[AGC]TGA[CTG][ATCG][CAT][AGT]GC[ATCG]"
pattern = re.compile(pat)
motifDF = []
motifQuant = []
with open('/Users/kalyanidhusia/Desktop/nrf2_R/ENCFF126HBJ.bed') as f:
    for line in f:
        peak = list(line.split())
        seq = genSeq('hg19.fa', peak[0], int(peak[1]), int(peak[2]))
        rSeq = reverseComp(seq)
        sequences = []
        for result in re.finditer(pattern, seq):
            sequences.append("".join(result.groups()))
        for result in re.finditer(pattern, rSeq):
            sequences.append("".join(result.groups()))
        if len(sequences) > 0:
            seqs = pd.DataFrame({'binding': sequences, 'chrom': peak[0],
                                 'chromstart': peak[1], 'chromend': peak[2]})
            motifDF.append(seqs)
            motifQuant.append([peak[0], peak[1], peak[2], len(seqs), len(seq)])

search_reg = pd.concat(motifDF)
names = ['chrom', 'chromstart', 'chromend', 'numOfMatches', 'lenSeq']
dist_reg = pd.DataFrame(motifQuant, columns=names)
Error
This is the error I am getting:
<ipython-input-3-2e7ebdf92205> in genSeq(genome_path, chrom, chromstart, chromend)
     25     identifiers = [seq_record.id for seq_record in genome]
---> 26     seq_gen = next(islice(genome, identifiers.index(chrom)+1 , None))
     27     seq = str(seq_gen.seq[chromstart:chromend])
     28 elif bool(re.search('2bit', genome_path)):

StopIteration:
How do I solve this problem?
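Note: the StopIteration most likely occurs because the SeqIO.parse generator is already exhausted by the list comprehension that builds identifiers, so the next(islice(...)) call has nothing left to yield. A minimal sketch of one possible workaround (not the fix I ended up using below) is to index the FASTA with SeqIO.index instead of re-reading the same generator; it assumes hg19.fa is a plain, uncompressed FASTA on disk:

# Sketch only: SeqIO.index gives dict-like access by record id,
# so the file is not consumed the way a SeqIO.parse generator is.
from Bio import SeqIO

def genSeqFasta(genome_path, chrom, chromstart, chromend):
    genome = SeqIO.index(genome_path, 'fasta')           # lazy, dict-like lookup
    return str(genome[chrom].seq[chromstart:chromend]).upper()

# seq = genSeqFasta('hg19.fa', 'chr1', 1000000, 1000500)  # hypothetical call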

I was able to solve the above problem by tweaking my code a little. Here is the working example, along with a new issue I ran into with the code below:
motif = '[REGULAR_EXPRESSION_FOR_YOUR_MOTIF]'
regBS = re.compile(motif)
motifDF = []
motifQuant = []
genome = tbr.TwoBitFile('/Path_to_your_genomefile_in_2bit.2bit/')
with open('/Path_to_your.bedfile/') as f:
    for line in f:
        if line.startswith('track') == False:
            peak = list(line.split())
            seq = (genome[peak[0]][int(peak[1]):int(peak[2])]).upper()
            rSeq = reverseComp(seq)
            sequences = []
            sequences.extend(re.findall(regBS, seq))
            sequences.extend(re.findall(regBS, rSeq))
            if len(sequences) > 0:
                seqs = pd.DataFrame({'binding': sequences, 'chrom': peak[0],
                                     'chromstart': peak[1], 'chromend': peak[2], 'NR': 'NRF2'})
                motifDF.append(seqs)
                motifQuant.append([peak[0], peak[1], peak[2], len(seqs), len(seq)])

search_reg = pd.concat(motifDF)
names = ['chrom', 'chromstart', 'chromend', 'numOfMatches', 'lenSeq']
dist_reg = pd.DataFrame(motifQuant, columns=names)
dist_reg.head()

n = 5
x = [len(i[6+n:-6-n]) for i in search_reg['binding']]
This code generates the peak sequences that I want and stores them in search_reg['binding'], but it also stores a space-separated number along with each sequence. I need to store these in two different columns. Any suggestions?
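A minimal sketch of one way to split such a column (this is an assumption about the layout, since the exact contents of the 'binding' values are not shown): pandas' str.split with expand=True splits a space-separated string column into separate columns in one call.

# Hypothetical example: if search_reg['binding'] holds strings like "ATGACTCAGCA 137",
# str.split with expand=True separates the sequence from the trailing number.
parts = search_reg['binding'].str.split(' ', n=1, expand=True)
search_reg['binding'] = parts[0]        # the sequence itself
search_reg['match_info'] = parts[1]     # the space-separated number (column name is hypothetical)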

Related

I want to parallelize this code to execute faster for 800000 sentences

from app import getPhonemes
import pandas as pd
import sys

triphones = []

def phonemize(sentence):
    tokens = sentence.split(' ')
    phonemes = getPhonemes(tokens)
    return '$'.join(phonemes)

def generateTriphones(phonemes):
    triphones = []
    for i in range(len(phonemes)):
        for j in range(len(phonemes)):
            for k in range(len(phonemes)):
                triphones.append(phonemes[i] + ' ' + phonemes[j] + ' ' + phonemes[k])
    return triphones

def scoreSentence(sentence, phonemes):
    flag = 0
    global triphones
    score = 0
    tokens = sentence.split('$')
    uniqueTokens = set(tokens)
    triphoneticTokens = [token for token in uniqueTokens if token.count(' ') > 1]
    for token in triphoneticTokens:
        for triphone in triphones:
            if token.find(triphone) != -1:
                score += 1
                triphones.remove(triphone)
    if triphones == []:
        flag = -1
    return score, flag

def Process(fil):
    global triphones
    file = open('itudict/vocab.phoneme', 'r', encoding='utf-8')
    data = []
    for line in file:
        data.append(line.strip())
    file.close()
    phonemes = data[4:]
    triphones = generateTriphones(phonemes)
    data = pd.read_csv(fil + '.csv')
    data = data.drop(['score', 'covered_vocab'], axis=1)
    i = 1
    while len(data) > 0:
        print('Processing File: ' + str(i))
        sentencee = data[:10000]
        data = data[10000:]
        sentences = sentencee['sentence'].tolist()
        phonemes = []
        scores = []
        for j in range(len(sentences)):
            if j % 1000 == 0:
                print('Processing Sentence: ' + str(j))
                print(len(triphones))
            phones = phonemize(sentences[j])
            score, flag = scoreSentence(phones, phonemes)
            if flag == -1:
                data = []
            phonemes.append(phones)
            scores.append(score)
        data['Phonemes'] = phonemes
        data['score'] = scores
        data.to_csv(fil + 'phonemized' + str(i) + '.csv', index=False)
        i += 1

if __name__ == '__main__':
    Process(sys.argv[1])
I am trying to generate phonemes for 800,000 sentences. The model I am using is G2P, which phonemizes each sentence; after phonemization I calculate the scores. The phoneme array I use for scoring has about 2,620,000 entries. With 800,000 sentences the code is taking days. Can somebody parallelize this code, or suggest another way to make it execute faster?
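A minimal sketch of one possible direction (not a drop-in rewrite of the code above): the phonemize step is independent per sentence, so it can be spread over a multiprocessing pool, while the scoring step, which mutates the shared triphones list, is kept sequential here. It assumes getPhonemes from the asker's app module is importable in worker processes.

# Sketch only: parallelize phonemization with a process pool.
from multiprocessing import Pool
from app import getPhonemes

def phonemize(sentence):
    return '$'.join(getPhonemes(sentence.split(' ')))

def phonemize_all(sentences, workers=8, chunksize=200):
    with Pool(processes=workers) as pool:
        # imap keeps the output order aligned with the input sentences
        return list(pool.imap(phonemize, sentences, chunksize=chunksize))

# Usage inside Process(): phones_list = phonemize_all(sentences)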

List inside dictionary

I am unable to get the last 3 digits of the id number.
from datetime import datetime

def days_to_birthday(date):
    datetime_object = datetime.strptime(date, "%Y-%m-%d")
    date = datetime_object.date()
    num_days = date.timetuple().tm_yday
return num_days

fo = open("Data.txt", 'r')  # file containing the data
content = [i.rsplit() for i in fo.readlines()]
names = [content[i][0] for i in range(len(content))]
dates = [content[i][1] for i in range(len(content))]
gender = [content[i][2] for i in range(len(content))]
id_numbers = []
mydict = dict(zip(dates, gender))
for i in mydict:
    x = days_to_birthday(i)
    if mydict.get(i) == "F":
        x += 500
    x = str(x)
    if len(x) < 3:
        x = x.zfill(3)
    i = i.split('-')
    out = i[0] + x
    id_numbers.append(out)
for i in range(len(names)):
    print(f"{names[i]} {id_numbers[i]}")
Running your code raises SyntaxError: 'return' outside function. Because Python is a whitespace-sensitive language, return num_days must be indented further so that it sits inside the days_to_birthday function.
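For illustration, here is the function with the return indented into its body (the rest of the script is unchanged):

from datetime import datetime

def days_to_birthday(date):
    datetime_object = datetime.strptime(date, "%Y-%m-%d")
    date = datetime_object.date()
    num_days = date.timetuple().tm_yday
    return num_days  # now part of the function body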

Calculating Sentiment with Pandas - Looping Calculation is Slow

I have a pandas dataframe consisting of headlines. I am doing a simple calculation of the sentiment by tokenizing each headline and comparing it with lists of positive and negative words. I append the overall sentiment for each headline to a column, append that to the original dataframe, and save it as an Excel file.
The resulting and original files are about 12 MB. While the code below works, it is slow, taking a couple of hours to fully read the file and assign the scores. Is this normal? Is there anything I can do to speed up the process? I understand that looping over a pandas dataframe column may be slow, so what are the alternatives?
# -*- coding: utf-8 -*-
from nltk.tokenize import word_tokenize
import pandas as pd
from violencevocabulary import new_words as extended_neg_list
import unicodedata

# function to calculate sentiment
def sentimentanalyzer(country_name, text_type):
    data = []
    xls_file = pd.ExcelFile('/UsersDesktop/MasterData.xlsx')
    df = xls_file.parse(country_name)
    text_body = df[text_type]
    text_body = pd.Series(text_body)
    headlines = text_body.tolist()
    for i in headlines:
        if type(i) == unicode:
            i = unicodedata.normalize('NFKD', i).encode('ascii', 'ignore')
        data.append(i)

    # processing the sentiment comparison files
    pos_words = []
    neg_words = []
    f = open('/Users/positive-words.txt', 'r')
    plines = f.readlines()
    for line in plines:
        line = line.rstrip('\n')
        line = line.lower()
        pos_words.append(line)
    positive_words = pos_words[35:]
    f.close()

    g = open('/Users/Desktop/negative-words.txt', 'r')
    nlines = g.readlines()
    neg_words = []
    for nline in nlines:
        nline = nline.strip('\n')
        nline = nline.lower()
        neg_words.append(nline)
    negative_words = neg_words[35:]
    g.close()
    negative_words = negative_words + extended_neg_list

    senti_list = []
    for j in data:
        tokens = word_tokenize(j)
        for k in tokens:
            negs = [k for k in tokens if k in negative_words]
            negs = len(negs)
            pos = [k for k in tokens if k in positive_words]
            pos = len(pos)
            calc = pos - negs
        print calc
        senti_list.append(calc)

    df2 = pd.Series(senti_list, name="Sentiment")
    new_data = pd.concat([df, df2], axis=1)
    new_data_name = '/Users/Desktop/Results/' + country_name + " " + text_type + ".xls"
    writer_new_data_name = pd.ExcelWriter(new_data_name, engine='xlsxwriter')
    new_data.to_excel(writer_new_data_name, sheet_name='Sheet1')
    return
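One alternative that usually helps here (a sketch, not a drop-in replacement for the code above): look words up in Python sets instead of lists, and compute the counts once per headline instead of once per token. The names below reuse the headlines list and the positive_words / negative_words lists built in the question's code.

# Sketch only: set membership is O(1), and counts are computed once per headline.
from nltk.tokenize import word_tokenize
import pandas as pd

def score_headlines(headlines, positive_words, negative_words):
    pos_set = set(positive_words)
    neg_set = set(negative_words)
    scores = []
    for headline in headlines:
        tokens = word_tokenize(headline)
        scores.append(sum(t in pos_set for t in tokens) - sum(t in neg_set for t in tokens))
    return pd.Series(scores, name="Sentiment")

# e.g. df2 = score_headlines(data, positive_words, negative_words)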

search pattern in sequence and report identity

I have 2 FASTA files with sequences in them. I want to align the sequences in the second file to the first file and report the identity.
For example:
File1:
>s1
aaccggactggacatccg
>s2
gtcgactctcggaattg
....
File2:
>a1
actg
>a2
tccg
.....
I want to take the file2 sequences, look for them in file1, and print each match (with any mismatched base in uppercase) together with the identity, in CSV format.
Output
name,a1_alignment,a1_identity,a2_alignment,a2_identity
s1,actg,100,tccg,100
s2,aCtg,95,tcCg,95
Here is what I have done so far:
import sys
import os, csv
from Bio import SeqIO
from itertools import *
from optparse import OptionParser

parser = OptionParser()
parser.add_option("-m", "--mismatch_threshold", dest="mismatch_threshold", default=2,
                  help="This is the number of differences you'll allow between the actualread and your sequence of interest. Default is 2")
(options, args) = parser.parse_args()

if len(sys.argv) != 4:
    print "Usage : python search.py <file1> <file2> <fout>"
    sys.exit()

f1 = open(sys.argv[1], 'r')
f2 = open(sys.argv[2], 'r')
fout = open(sys.argv[3], 'w')
writer = csv.writer(fout)

def long(f1):
    for record in SeqIO.parse(f1, 'fasta'):
        header = record.name
        sequence = record.seq
        yield [header, sequence]

def short(f2):
    for record in SeqIO.parse(f2, 'fasta'):
        head = record.name
        seq = record.seq
        return seq

def alignment(sequence, seq, mismatch_threshold):
    l1 = len(sequence)
    l2 = len(seq)
    alignment = []
    for i in range(0, min(l1, l2)):
        if sequence[i] == seq[i]:
            alignment.append(i)
        else:
            mismatch = sum(c1 != c2 for c1, c2 in zip(sequence, seq))
            if mismatch <= mismatch_threshold:
                alignment.append(i)
    k = 0
    l = 0
    for read in alignment:
        for letter in read:
            if letter == isupper():
                pass
            else:
                if letter == alignment[0].seq[j]:
                    l += 1
            k += 1
        k = 0
    length = seq
    percent = 100 * l / len(seq)
    # print percent
    yield percent

longsequences = long(open(sys.argv[1], 'r'))
shortsequences = short(open(sys.argv[2], 'r'))
align = alignment(longsequences, shortsequences, options.mismatch_threshold)

for name in head:
    writer.writerow((name + '_alignment', name + '_identity'))
for s in align:
    # print to csv file
I need help with looking up the file2 sequences in file1 with mismatches, printing the alignment, and calculating the identity percentage.
Error:
File "s.py", line 34, in alignment
l1 = len(sequence)
TypeError: object of type 'generator' has no len()
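The TypeError comes from passing the generator returned by long() to len(); iterating over the parsed records instead avoids it. A minimal sketch of one way to do the comparison (a simple ungapped scan at every offset, which is an assumption and not the asker's exact scoring scheme; filenames are hypothetical):

# Sketch only: slide each short sequence along each long sequence and report
# the best percent identity, with mismatched bases shown in uppercase.
from Bio import SeqIO

def identity_at_best_offset(long_seq, short_seq):
    best_pct, best_aln = 0.0, ""
    for start in range(len(long_seq) - len(short_seq) + 1):
        window = long_seq[start:start + len(short_seq)]
        matches = sum(a == b for a, b in zip(window, short_seq))
        pct = 100.0 * matches / len(short_seq)
        if pct > best_pct:
            aln = "".join(b.lower() if a == b else b.upper()
                          for a, b in zip(window, short_seq))
            best_pct, best_aln = pct, aln
    return best_pct, best_aln

long_records = list(SeqIO.parse("file1.fa", "fasta"))    # hypothetical filenames
short_records = list(SeqIO.parse("file2.fa", "fasta"))
for rec1 in long_records:
    for rec2 in short_records:
        pct, aln = identity_at_best_offset(str(rec1.seq).lower(), str(rec2.seq).lower())
        print(rec1.id, rec2.id, aln, pct)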

Making columns of data lists in python

So I have a program that reads through a bunch of files and appends the data I need. I now need to take those data and show them as a list. To be more specific, these are the parameters I have:
a = Source, b = luminosity, c = luminosity error, d = HST, e = XRS, f = gmag, g = z, and h = rh
I want to display these as a list, each parameter defining a particular column. I just don't know where exactly I should insert the print statement among the various for loops I've written.
I would appreciate any help! Here's the program (the main focus is on the for loops and how they iterate through the data; don't worry about the indentation, the program works so far, I just need to display the appended data in columns):
import sys
import os
import re
import urllib
import urllib2
from os.path import basename
import urlparse
import shutil

base_dirname = '/projects/XRB_Web/apmanuel/499/'
base_sourcefile = base_dirname + 'Sources.txt'
try:
    file = open(base_sourcefile, 'r')
except IOError:
    print 'Cannot open: '+base_sourcefile

Source = []
Finallist = []
ACS = []
SRC = []

for line in file:
    data_line_check = (line.strip())
    if data_line_check:
        line = re.sub(r'\s+', ' ', line)
        point = line.split('|')

        temp_source = (point[0]).strip()
        if temp_source and len(point) == 3:
            Source = (point[0]).strip()
            Source = re.sub(r'\s', '_', Source)
            print Source+"\n"

        temp_finallist = (point[1]).strip()
        if temp_finallist:
            Finallistaddress = (point[1]).strip()
            Finallistaddress = re.sub(r'\s', '_', Finallistaddress)
            Luminositybase_dirname1 = '/projects/XRB_Web/apmanuel/499/Lists/' + Finallistaddress
            try:
                file2 = open(Luminositybase_dirname1, 'r')
            except IOError:
                print 'Cannot open: '+Luminositybase_dirname1
            source = []
            luminosity = []
            luminosityerr = []
            for line in file2:
                pointy = line.split()
                a = int(pointy[0])
                b = float(pointy[5])
                c = float(pointy[6])
                source.append(a)
                luminosity.append(b)
                luminosityerr.append(c)

        temp_HST = (point[2]).strip()
        if temp_HST:
            HSTaddress = (point[2]).strip()
            HSTaddress = re.sub(r'\s', '_', HSTaddress)
            HSTbase_dirname2 = '/projects/XRB_Web/apmanuel/499/Lists/' + HSTaddress
            try:
                file3 = open(HSTbase_dirname2, 'r')
            except IOError:
                print 'Cannot open: '+HSTbase_dirname2
            HST = []
            for line in file3:
                pointy2 = line.split()
                d = int(pointy2[0])
                HST.append(d)

        temp_XRS = (point[3]).strip()
        if temp_XRS:
            XRSaddress = (point[3]).strip()
            XRSaddress = re.sub(r'\s', '_', XRSaddress)
            XRSbase_dirname3 = '/projects/XRB_Web/apmanuel/499/Lists/' + XRSaddress
            try:
                file4 = open(XRSbase_dirname3, 'r')
            except IOError:
                print 'Cannot open: '+XRSbase_dirname3
            XRS = []
            for line in file4:
                pointy3 = line.split()
                e = int(pointy3[0])
                XRS.append(e)

        temp_others = (point[4]).strip()
        if temp_others:
            othersaddress = (point[4]).strip()
            othersaddress = re.sub(r'\s', '_', othersaddress)
            othersbase_dirname4 = '/projects/XRB_Web/apmanuel/499/Lists/' + othersaddress
            try:
                file5 = open(othersbase_dirname4, 'r')
            except IOError:
                print 'Cannot open: '+othersbase_dirname4
            gmag = []
            z = []
            rh = []
            for line in file5:
                pointy4 = line.split()
                f = float(pointy4[3])
                g = float(pointy4[5])
                h = float(pointy4[7])
                rh.append(f)
                gmag.append(g)
                z.append(h)
This function will return a column from a list of rows. Note that it requires every row to have an element at the column index you are trying to access, though it would be relatively simple to change this if you need to.
def getcolumn(matrix, index):
    # index specifies which column of the matrix you want; note that like all
    # other list indexes, this starts from 0, not 1.
    column = []
    for row in matrix:
        column.append(row[index])
    return column
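A short usage sketch (an assumption about how the lists line up, since the question's loops build them per source file): zip the parallel lists from the question into rows, then pull out whichever column you need with getcolumn.

# Hypothetical usage: rows built from the per-source lists appended in the loops above.
rows = list(zip(source, luminosity, luminosityerr))
lum_column = getcolumn(rows, 1)    # every luminosity value, as one column
for row in rows:
    print(row)                     # one (source, luminosity, error) row per line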
