BioPython: extracting sequence IDs from a BLAST output file

I have a BLAST output file in XML format. It contains 22 query sequences, with 50 hits reported for each query, and I want to extract all 22x50 hits. This is the code I currently have, but it only extracts the 50 hits from the first query.
from Bio.Blast import NCBIXML

blast_records = NCBIXML.parse(result_handle)
blast_record = blast_records.next()  # consumes only the first query record
save_file = open("/Users/jonbra/Desktop/my_fasta_seq.fasta", 'w')
for alignment in blast_record.alignments:
    for hsp in alignment.hsps:
        save_file.write('>%s\n' % (alignment.title,))
save_file.close()
Does anybody have suggestions on how to extract all the hits? I guess I have to use something other than alignments.
Hope this was clear. Thanks!
Jon

This should get all records. The novelty compared with the original is the
for blast_record in blast_records:
which is a Python idiom for iterating through the items of a "list-like" object such as blast_records (checking the NCBIXML module documentation confirmed that parse() indeed returns an iterator).
from Bio.Blast import NCBIXML

blast_records = NCBIXML.parse(result_handle)
save_file = open("/Users/jonbra/Desktop/my_fasta_seq.fasta", 'w')
for blast_record in blast_records:
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            save_file.write('>%s\n' % (alignment.title,))
    # here you could possibly write something to the file between blast_records
save_file.close()
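As a side note, the output file has a .fasta extension but only header lines are written. If the aligned subject sequences are wanted as well, a minimal sketch (assuming hsp.sbjct is the sequence you are after) could look like:
from Bio.Blast import NCBIXML

blast_records = NCBIXML.parse(result_handle)
with open("/Users/jonbra/Desktop/my_fasta_seq.fasta", 'w') as save_file:
    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                # header line, then the aligned subject sequence
                save_file.write('>%s\n%s\n' % (alignment.title, hsp.sbjct))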

I used this code to extract all the results:
from Bio.Blast import NCBIXML

for record in NCBIXML.parse(open("rpoD.xml")):
    print "QUERY: %s" % record.query
    for align in record.alignments:
        print "  MATCH: %s..." % align.title[:60]
        for hsp in align.hsps:
            print "    HSP, e=%f, from position %i to %i" \
                % (hsp.expect, hsp.query_start, hsp.query_end)
            if hsp.align_length < 60:
                print "      Query: %s" % hsp.query
                print "      Match: %s" % hsp.match
                print "      Sbjct: %s" % hsp.sbjct
            else:
                print "      Query: %s..." % hsp.query[:57]
                print "      Match: %s..." % hsp.match[:57]
                print "      Sbjct: %s..." % hsp.sbjct[:57]
print "Done"
or, for less detail:
from Bio.Blast import NCBIXML

for record in NCBIXML.parse(open("NC_003197.xml")):
    # We want to ignore any queries with no search results:
    if record.alignments:
        print "QUERY: %s..." % record.query[:60]
        for align in record.alignments:
            for hsp in align.hsps:
                print "  %s HSP, e=%f, from position %i to %i" \
                    % (align.hit_id, hsp.expect, hsp.query_start, hsp.query_end)
print "Done"
I used this site:
http://www2.warwick.ac.uk/fac/sci/moac/currentstudents/peter_cock/python/rpsblast/

Related

Get rid of parenthesis in output

I think this is an easy question for you, as I am a beginner with Python 3.
When I print the header of a FASTA file it contains parentheses. How can I remove them?
import sys
from Bio import Entrez
from Bio import SeqIO

# define email for Entrez login
db = "nuccore"
Entrez.email = "someone@email.com"

# load accessions from arguments
if len(sys.argv[1:]) > 1:
    accs = sys.argv[1:]
else:  # load accessions from stdin
    accs = [l.strip() for l in sys.stdin if l.strip()]

# fetch
sys.stderr.write("Fetching %s entries from GenBank: %s\n" % (len(accs), ", ".join(accs[:10])))
for i, acc in enumerate(accs):
    try:
        sys.stderr.write(" %9i %s \r" % (i + 1, acc))
        handle = Entrez.efetch(db=db, rettype="fasta", id=acc)
        seq_record = SeqIO.read(handle, "fasta")
        if len(seq_record.seq) > 0:
            header = ">" + seq_record.description + " Len:", len(seq_record.seq)
            print(header)
            print(seq_record.seq)
    except:
        sys.stderr.write("Error! Cannot fetch: %s \n" % acc)
./acc2fasta.py 163345 303239
It will return
(">M69206.1 Bovine MHC class I AW10 mRNA (haplotype AW10), 3' end Len:", 1379)
TCCTGCTGCTCTCGGGGGTCCTGGTCCTGACCGAGACCCGGGCTGGCTCCCACTCGATGAGGTATTTCAGCACCGCCGTGTCCCGGCCCGGCCTCGGGGAGCCCCGGTACCTGGAAGTCGGCTACGTGGACGACACGCAGTTCGTGCGGTTTGACAGCGACGCCCCGAATCCGAGGATGGAGCCGCGGGCGCGGTGGGTGGAGCAGGAGGGGCCGGAGTATTGGGATCGGGAGACGCAAAGGGCCAAGGGCAACGCACAATTTTTCCGAGTGAGCCTGAACAACCTGCGCGGCTACTACAACCAGAGCGAGGCCGGGTCTCACACCCTCCAGTGGATGTCCGGCTGCTACGTGGGGCCGGACGGGCGTCCTCCGCGCGGGTTCATGCAGTTCGGCTACGACGGCAGAGATTACCTCGCCCTGAACGAGGACCTGCGCTCCTGGACCGCGGTGGAGACGATGGCTCAGATCTCCAAACGCAAGATGGAGGCGGCCGGTGAAGCTGAGGTACAGAGGAACTACCTGGAGGGCCGGTGCGTGGAGTGGCTCCGCAGATACCTGGAGAACGGGAAGGACACGCTGCTGCGCGCAGACCCTCCAAAGGCACATGTGACCCGTCACCCGATCTCTGGTCGTGAGGTCACCCTGAGGTGCTGGGCCCTGGGCTTCTACCCTGAAGAGATCTCACTGACCTGGCAGCGCAATGGGGAGGACCAGACCCAGGACATGGAGCTTGTGGAGACCAGGCCTTCAGGGGACGGAAACTTCCAGAAGTGGGCGGCCCTGTTGGTGCCTTCTGGAGAGGAGCAGAAATACACATGCCAAGTGCAGCACGAGGGGCTTCAGGAGCCCCTCACCCTGAAATGGGAACCTCCTCAGCCCTCCTTCCTCACCATGGGCATCATTGTTGGCCTGGTTCTCCTCGTGGTCACTGGAGCTGTGGTGGCTGGAGTTGTGATCTGCATGAAGAAGCGCTCAGGTGAAAAACGAGGGACTTATATCCAGGCTTCAAGCAGTGACAGTGCCCAGGGCTCTGATGTGTCTCTCACGGTTCCTAAAGTGTGAGACACCTGCCTTCGGGGGACTGAGTGATGCTTCATCCCGCTATGTGACATCAGATCCCCGGAACCCCTTTTTCTGCAGCTGCATCTGAATGTGTCAGTGCCCCTATTCGCATAAGTAGGAGTTAGGGAGACTGGCCCACCCATGCCCACTGCTGCCCTTCCCCACTGCCGTCCCTCCCCACCCTGACCTGTGTTCTCTTCCCTGATCCACTGTCCTGTTCCAGCAGAGACGAGGCTGGACCATGTCTATCCCTGTCTTTGCTTTATATGCACTGAAAAATGATATCTTCTTTCCTTATTGAAAATAAAATCTGTC
Error! Cannot fetch: 303239
How can I get rid of the parentheses in the output?
header = ">" + seq_record.description + " Len:" , len(seq_record.seq)
print(header)
By doing so you're printing the representation of a tuple: with commas (expected) but also parentheses (unwanted).
The best way would be to join the data instead, so a comma is inserted between the string fields but the tuple representation is left out:
print(",".join(header))
In your case it's a little trickier: you have to convert the non-string arguments to strings (the tuple representation did the conversion, but join doesn't):
print(",".join([str(x) for x in header]))
result:
>M69206.1 Bovine MHC class I AW10 mRNA (haplotype AW10), 3' end Len:,1379
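Alternatively, you could avoid building a tuple in the first place and format the header as a single string; a minimal sketch:
header = ">%s Len:%d" % (seq_record.description, len(seq_record.seq))
print(header)
This prints the header with no parentheses and no stray comma.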

Python simple Multi-threaded Downloader file corrupted

This is my first post. I have been doing Python programming for quite some time and recently was working on a multi-threaded downloader. But the problem is that my file (a jpg is my target) gets corrupted. Also, with the following input: http://www.aumathletics.com/images_web/headerAUMLogo.jpg
it shows an error, while with the input:
http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg
the file gets corrupted.
Here is the code:
import os, sys, requests
import threading
import urllib2
import time

URL = sys.argv[1]

def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0), 0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
    return lst

def main(url=None, splitBy=5):
    start_time = time.time()
    if not url:
        print "Please Enter some url to begin download."
        return

    fileName = "image.jpg"
    sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
    print "%s bytes to download." % sizeInBytes
    if not sizeInBytes:
        print "Size cannot be determined."
        return

    dataDict = {}

    # split total num bytes into ranges
    ranges = buildRange(int(sizeInBytes), splitBy)

    def downloadChunk(idx, irange):
        req = urllib2.Request(url)
        req.headers['Range'] = 'bytes={}'.format(irange)
        dataDict[idx] = urllib2.urlopen(req).read()

    # create one downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadChunk,
            args=(idx, irange),
        )
        for idx, irange in enumerate(ranges)
    ]

    # start threads, let run in parallel, wait for all to finish
    for th in downloaders:
        th.start()
    for th in downloaders:
        th.join()

    print 'done: got {} chunks, total {} bytes'.format(
        len(dataDict), sum((
            len(chunk) for chunk in dataDict.values()
        ))
    )
    print "--- %s seconds ---" % str(time.time() - start_time)

    if os.path.exists(fileName):
        os.remove(fileName)

    # reassemble file in correct order
    with open(fileName, 'w') as fh:
        for _idx, chunk in sorted(dataDict.iteritems()):
            fh.write(chunk)

    print "Finished Writing file %s" % fileName
    print 'file size {} bytes'.format(os.path.getsize(fileName))

if __name__ == '__main__':
    main(URL)
The indentation here might be wrong, so here is the code: pastebin(dot)com/wGEkp878
I would be very grateful if someone could point out the error.
EDIT: as suggested by someone:
def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        first = i if i == 0 else buildRange().start(i, value, numsplits)
        second = buildRange().end(i, value, numsplits)
        lst.append("{}-{}".format(first, second))
    return lst
Can anyone tell me how to keep the downloaded part files with names like part1, part2, and so on?
It turns out the file must be opened in binary mode, with 'wb' instead of 'w'. If it is opened with just 'w', a bunch of extra characters will be written. This has to do with Windows vs. Linux newline semantics: in text mode on Windows, every '\n' is translated to '\r\n' on write. If you use 'wb' it will write exactly the bytes you put into the file.
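A tiny illustration of the corruption (a sketch; 'demo.bin' is a made-up filename, and the translation only happens on Windows):
# in text mode, Windows translates each '\n' byte to '\r\n' on write
with open('demo.bin', 'w') as fh:
    fh.write('\xff\xd8\n\xff')   # JPEG-like bytes that happen to contain 0x0a
# the file now holds '\xff\xd8\r\n\xff' -- one extra byte, image corrupted

with open('demo.bin', 'wb') as fh:
    fh.write('\xff\xd8\n\xff')   # binary mode writes the four bytes verbatim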
EDIT:
If you want to store the individual file parts, you can change
# reassemble file in correct order
with open(fileName, 'w') as fh:
    for _idx, chunk in sorted(dataDict.iteritems()):
        fh.write(chunk)

print "Finished Writing file %s" % fileName
print 'file size {} bytes'.format(os.path.getsize(fileName))
To
# write each chunk to its own part file, in index order
for _idx, chunk in sorted(dataDict.iteritems()):
    with open(fileName + ".part-" + str(_idx), 'wb') as fh:
        fh.write(chunk)

print "Finished Writing file %s" % fileName
#print 'file size {} bytes'.format(os.path.getsize(fileName))
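If the parts ever need to be stitched back into one file, a sketch for reassembling them in index order (assuming the '.part-N' naming scheme above):
import glob

with open(fileName, 'wb') as out:
    # sort the part files numerically by the index after the final '-'
    for part in sorted(glob.glob(fileName + '.part-*'),
                       key=lambda p: int(p.rsplit('-', 1)[1])):
        with open(part, 'rb') as fh:
            out.write(fh.read())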

Getting Python Script(Output) to JSON file

I am running a Python script, and I am trying to extract a part of its output and convert it to a JSON file so that I can pass it to Google Visualization.
I have included my Python script, its output, and detailed info.
What I am doing:
I am trying to run this Python script from AlchemyAPI
https://github.com/AlchemyAPI/alchemyapi-twitter-python
My output is as follows:
I am getting the stats from the output.
##########################################################
# The Tweets #
##########################################################
#uDiZnoGouD
Date: Mon Apr 07 05:07:19 +0000 2014
To enjoy in case you win!
To help you sulk in case you loose!
#IndiavsSriLanka #T20final http://t.co/hRAsIa19zD
Document Sentiment: positive (Score: 0.261738)
##########################################################
# The Stats #
##########################################################
Document-Level Sentiment:
Positive: 3 (60.00%)
Negative: 1 (20.00%)
Neutral: 1 (20.00%)
Total: 5 (100.00%)
This is the sample code for the stats:
def stats(tweets):
    """
    Calculate and print out some basic summary statistics

    INPUT:
    tweets -> an array containing the analyzed tweets
    """
    # init
    data = {}
    data['doc'] = {}
    data['doc']['positive'] = 0
    data['doc']['negative'] = 0
    data['doc']['neutral'] = 0
    data['doc']['total'] = 0
    data['entity'] = {}
    data['entity']['positive'] = 0
    data['entity']['negative'] = 0
    data['entity']['neutral'] = 0
    data['entity']['total'] = 0

    # loop through the tweets and count up the positives, negatives and neutrals
    for tweet in tweets:
        if 'entity' in tweet['sentiment']:
            data['entity'][tweet['sentiment']['entity']['type']] += 1
            data['entity']['total'] += 1
        if 'doc' in tweet['sentiment']:
            data['doc'][tweet['sentiment']['doc']['type']] += 1
            data['doc']['total'] += 1

    # make sure there are some analyzed tweets
    if data['doc']['total'] == 0 and data['entity']['total'] == 0:
        print 'No analysis found for the Tweets'
        sys.exit()

    # print the stats
    print ''
    print ''
    print '##########################################################'
    print '#                       The Stats                        #'
    print '##########################################################'
    print ''
    print ''
    if data['entity']['total'] > 0:
        print 'Entity-Level Sentiment:'
        print 'Positive: %d (%.2f%%)' % (data['entity']['positive'], 100.0*data['entity']['positive']/data['entity']['total'])
        print 'Negative: %d (%.2f%%)' % (data['entity']['negative'], 100.0*data['entity']['negative']/data['entity']['total'])
        print 'Neutral: %d (%.2f%%)' % (data['entity']['neutral'], 100.0*data['entity']['neutral']/data['entity']['total'])
        print 'Total: %d (%.2f%%)' % (data['entity']['total'], 100.0*data['entity']['total']/data['entity']['total'])
        print ''
        print ''
    if data['doc']['total'] > 0:
        print 'Document-Level Sentiment:'
        print 'Positive: %d (%.2f%%)' % (data['doc']['positive'], 100.0*data['doc']['positive']/data['doc']['total'])
        print 'Negative: %d (%.2f%%)' % (data['doc']['negative'], 100.0*data['doc']['negative']/data['doc']['total'])
        print 'Neutral: %d (%.2f%%)' % (data['doc']['neutral'], 100.0*data['doc']['neutral']/data['doc']['total'])
        print 'Total: %d (%.2f%%)' % (data['doc']['total'], 100.0*data['doc']['total']/data['doc']['total'])
Problem Statement:
I would like to get the positive, negative, and neutral sentiment counts in JSON format so that I can pass them to Google Visualization. How can I make a JSON file which contains my final stats (positive, negative, and neutral)?
import json
json_data = json.dumps(data)
If there is something like a datetime object, make sure to convert it to str so that it is JSON serializable.
FYI: json.loads is used to load it back from the JSON string.
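For example, a minimal sketch for the datetime case, using the default= hook of json.dumps:
import json, datetime

# default=str stringifies anything json can't serialize natively (e.g. datetimes)
json_data = json.dumps({'when': datetime.datetime.now()}, default=str)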
Here, make a dict, list, or Python object:
data_list = []
temp = 'Positive: %d (%.2f%%)' % (data['entity']['positive'], 100.0*data['entity']['positive']/data['entity']['total'])
data_list.append(temp)
temp = 'Total: %d (%.2f%%)' % (data['entity']['total'], 100.0*data['entity']['total']/data['entity']['total'])
data_list.append(temp)
Do the same for the other data values. Now you can dump the data:
json.dumps(data_list)
Note: since the dict `data` already contains all the information and you are only formatting it here, you could instead dump `data` as-is and do the formatting on the client side that receives the JSON response:
json.dumps(data)
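As a concrete sketch (the filename and key names are my own choices): dump just the document-level counts to a file that a Google Visualization page could then fetch:
import json

# 'doc_stats.json' is a hypothetical output filename
with open('doc_stats.json', 'w') as fh:
    json.dump({'positive': data['doc']['positive'],
               'negative': data['doc']['negative'],
               'neutral': data['doc']['neutral'],
               'total': data['doc']['total']}, fh)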

python xml parsing; multiple xml files

I have multiple XML files with similar elements. How can I extract the child elements from multiple files? I wrote sample code which extracts the required elements from a single file, but I need to extract from multiple XML files. The major problem here is that it should print the required fields only if admin-server-name equals name.
#!/usr/bin/python
import commands
from xml.dom.minidom import parse
from xml.dom import minidom
import xml.dom.minidom

cmd = "find . / -name config.xml 2>/dev/null |grep -w config/config.xml"
p = commands.getoutput(cmd)
count = 0
admnprt = []
lstnaddr = []
dmnver = []
admnsrvnme = []
serverName = []
word = p.rstrip().split('\n')
print word
print "no of domains are %s" % len(word)
for line in word:
    count = count + 1
    DomTree = xml.dom.minidom.parse(line)
    domain = DomTree.documentElement
    admnprt = domain.getElementsByTagName("administration-port")
    lstnaddr = domain.getElementsByTagName("listen-address")
    admnsrvnme = domain.getElementsByTagName("admin-server-name")
    dmnver = domain.getElementsByTagName("domain-version")
    serverName = domain.getElementsByTagName("name")  # was `document.`, which is undefined here
    # note: this compares a NodeList against the literal string "admnsrvnme",
    # which is never true -- the text values of the elements need comparing instead
    if serverName == "admnsrvnme":
        for i in admnprt:
            print "administration-port for %s domain is " % count + " " + "%s" % i.childNodes[0].data
        for j in lstnaddr:
            print "listen-address for %s domain is " % count + " " + "%s" % j.childNodes[0].data
        for a in admnsrvnme:
            print "admin-server-name for %s domain is " % count + " " + "%s" % a.childNodes[0].data
        for b in dmnver:
            print "domain-version for %s domain is " % count + " " + "%s" % b.childNodes[0].data
Put your code in a function and call it for all files.
#!/usr/bin/python
import sys
from xml.dom.minidom import parse
import xml.dom.minidom

def parse_file(filename):
    DOMTree = xml.dom.minidom.parse(filename)
    domain = DOMTree.documentElement
    name = domain.getElementsByTagName("domain-version")[0]
    print " %s" % name.childNodes[0].data
    adminservername = domain.getElementsByTagName("admin-server-name")[0]
    print " %s" % adminservername.childNodes[0].data
    listenaddress = domain.getElementsByTagName("listen-address")[0]
    print " %s" % listenaddress.childNodes[0].data
    administrationport = domain.getElementsByTagName("administration-port")[0]
    print " %s" % administrationport.childNodes[0].data

for argument in sys.argv[1:]:
    parse_file(argument)
Now you can run the script with many filenames:
yourscript.py filename1 filename2
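If you'd rather discover the config.xml files from within Python instead of shelling out to find, a hedged sketch using os.walk (assuming the directory layout from the question):
import os

for root, dirs, filenames in os.walk('.'):
    for fname in filenames:
        if fname == 'config.xml':
            parse_file(os.path.join(root, fname))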

Trying to get output of ffprobe into variable

I am trying to grab the ffprobe values from a video file into a variable that I can compare against others, or move the value into a database. The question I have: is there a better way of doing it than below?
I don't like the multiple if/elif line.startswith statements, and I am not sure that split is the best way of getting the ffprobe values.
#!/usr/bin/python
import os, sys, subprocess, shlex, re, fnmatch
from subprocess import call

videoDrop_dir = "/mnt/VoigtKampff/Temp/_Jonatha/test_drop"
for r, d, f in os.walk(videoDrop_dir):
    for files in f:
        print "Files: %s" % files
        if files.startswith(('._', '.')):
            print "This file: %s is not valid" % files
        elif files.endswith(('.mov', '.mpg', '.mp4', '.wmv', '.mxf')):
            fpath = os.path.join(r, files)

            def probe_file(fpath):
                cmnd = ['ffprobe', '-show_format', '-show_streams', '-pretty', '-loglevel', 'quiet', fpath]
                p = subprocess.Popen(cmnd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                print files
                out, err = p.communicate()
                print "===============================OUTPUT START: %s ===============================" % files
                print out
                # note: several of the startswith() calls below originally carried a stray
                # second argument (', 1'), which makes the match start at index 1 and
                # silently never succeed; the stray argument is dropped here
                for line in out.split('\n'):
                    line = line.strip()
                    if line.startswith('codec_name='):
                        codec_name = line.split('codec_name=', 1)
                        print "Codec is: %s" % codec_name[1]
                        codec_1 = codec_name[1]
                    elif line.startswith('codec_type='):
                        codec_type = line.split('codec_type=', 1)
                        print "Codec type is: %s" % codec_type[1]
                        codec_type1 = codec_type[1]
                    elif line.startswith('codec_long_name='):
                        codec_long_name = line.split('codec_long_name=', 1)
                        print "Codec long name: %s" % codec_long_name[1]
                        codec_long_name = codec_long_name[1]
                    elif line.startswith('format_long_name='):
                        format_long_name = line.split('format_long_name=', 1)
                        print "Format long name: %s" % format_long_name[1]
                        format_long_name = format_long_name[1]
                    elif line.startswith('width='):
                        width = line.split('width=', 1)
                        print "Video pixel width is: %s" % width[1]
                        p_width = width[1]
                    elif line.startswith('height='):
                        height = line.split('height=', 1)
                        print "Video pixel height is: %s" % height[1]
                        p_height = height[1]
                    elif line.startswith('bit_rate='):
                        bit_rate = line.split('bit_rate=', 1)
                        print "Bit rate is: %s" % bit_rate[1]
                        bit_rate1 = bit_rate[1]
                    elif line.startswith('display_aspect_ratio='):
                        display_aspect_ratio = line.split('display_aspect_ratio=', 1)
                        print "Display aspect ratio: %s" % display_aspect_ratio[1]
                        display_aspect_ratio1 = display_aspect_ratio[1]
                    elif line.startswith('avg_frame_rate='):
                        avg_frame_rate = line.split('avg_frame_rate=', 1)
                        print "Average Frame Rate: %s" % avg_frame_rate[1]
                        avg_frame_rate1 = avg_frame_rate[1]
                print "===============================OUTPUT FINISH: %s ===============================" % files
                if err:
                    print "===============================ERROR: %s ===============================" % files
                    print err

            probe_file(fpath)
        else:
            if not files.startswith(('.mov', '.mpg', '.mp4', '.wmv', '.mxf')):
                print "This file: %s is not a valid video file" % files
This is a bit late, but hopefully it helps others searching for a similar answer.
import json, subprocess

# grab info about video_file (here, `v` holds the path to the video file)
ffprobe_cmd = '/home/ubuntu/bin/ffprobe -v quiet -print_format json -show_format -show_streams -i ' + v + ' 2>&1'
# print ffprobe_cmd
s = subprocess.Popen(ffprobe_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
ffprobe_out, err = s.communicate()
ffprobe_dict = json.loads(ffprobe_out)
From here, I re-use a common method, search_dict, which can be used like:
search_dict(ffprobe_dict, 'height')
def search_dict(my_dict, field):
    """Takes a dict with nested lists and dicts,
    and searches all dicts for a key of the field
    provided.
    """
    fields_found = []
    for key, value in my_dict.iteritems():
        if key == field:
            fields_found.append(value)
        elif isinstance(value, dict):
            results = search_dict(value, field)
            for result in results:
                fields_found.append(result)
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, dict):
                    more_results = search_dict(item, field)
                    for another_result in more_results:
                        fields_found.append(another_result)
    return fields_found
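As a lighter alternative sketch: ffprobe's JSON output groups the per-stream fields under a 'streams' list, so the parsed dict can also be indexed directly (codec_type, width, height and codec_name are standard ffprobe keys):
# pick out the video stream's dimensions and codec without a recursive search
for stream in ffprobe_dict.get('streams', []):
    if stream.get('codec_type') == 'video':
        print stream.get('width'), stream.get('height'), stream.get('codec_name')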
You should ask this question on https://codereview.stackexchange.com/
