python xml parsing; multiple xml files - python

I have multiple XML files with similar elements. How can I extract the child elements from all of them? I wrote some sample code that extracts the required elements from a single file, but I need to do the same for multiple XML files. The main problem is that it should print the required fields only if the value of admin-server-name equals the value of name.
#!/usr/bin/python
import commands
from xml.dom.minidom import parse
from xml.dom import minidom
import xml.dom.minidom
cmd = "find . / -name config.xml 2>/dev/null |grep -w config/config.xml"
p = commands.getoutput(cmd)
count=0;
admnprt=[]
lstnaddr=[]
dmnver=[]
admnsrvnme=[]
serverName=[]
word=p.rstrip().split('\n')
print word
print "no of domains are %s" %len(word)
for line in word :
count =count +1
DomTree =xml.dom.minidom.parse(line)
domain=DomTree.documentElement
admnprt=domain.getElementsByTagName("administration-port")
lstnaddr=domain.getElementsByTagName("listen-address")
admnsrvnme=domain.getElementsByTagName("admin-server-name")
dmnver=domain.getElementsByTagName("domain-version")
serverName = document.getElementsByTagName("name")
if(serverName == "admnsrvnme"):
for i in admnprt:
print "administration-port for %s domain is " % count + " " + "%s" %i.childNodes[0].data
for j in lstnaddr:
print "listen-address for %s domain is " % count + " " + "%s" %j.childNodes[0].data
for a in admnsrvnme:
print "admin-server-name for %s domain is " % count + " " + "%s" %a.childNodes[0].data
for b in dmnver:
print "domain-version for %s domain is " % count + " " + "%s" %b.childNodes[0].data

Put your code into a function and call it once for each file.
#!/usr/bin/python
import sys
from xml.dom.minidom import parse
import xml.dom.minidom
def parse_file(filename):
    """Print selected settings found in one XML file.

    For each of the tags ``domain-version``, ``admin-server-name``,
    ``listen-address`` and ``administration-port`` (in that order), the
    text of the first matching element is printed on its own line,
    prefixed by a single space.

    A tag that is absent from the document, or whose first element has no
    child node (e.g. ``<listen-address></listen-address>``), is skipped
    instead of raising ``IndexError``.

    Args:
        filename: Path of the XML file to parse.
    """
    domain = xml.dom.minidom.parse(filename).documentElement
    tags = ("domain-version", "admin-server-name",
            "listen-address", "administration-port")
    for tag in tags:
        elements = domain.getElementsByTagName(tag)
        # Guard both lookups: the tag may be missing, or present but empty.
        if not elements or not elements[0].childNodes:
            continue
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(" %s" % elements[0].childNodes[0].data)
if __name__ == "__main__":
    # Parse every file named on the command line, in the order given.
    # The guard keeps the module importable without side effects.
    for argument in sys.argv[1:]:
        parse_file(argument)
Now you can run the script with any number of filenames:
yourscript.py filename1 filename2

Related

How do I skip values with no content?

How do I store the values with content into strings?
I know there has to be a much cleaner and more efficient way of doing this but currently I am struggling to find a way. I would appreciate a set of fresh eyes on this since I must be missing something. I have spent an outlandish time on this.
My objective is:
Check if sheet.values has content -> if so, store as a string
Check if sheet.values has content -> if not, skip or create no string
The priority of this is that sheet.values can contain an undetermined amount of content that needs to be identified. Such as sheet.values filled in being up to [9] one instance but being filled in to [6] another instance. So it needs to account for this.
The sheet.values also have to return as a string as I use makedirs() later in the code (it gets a bit testy this also needs work if you can help)
I know a for loop should be able to help me but just not found the right one just yet.
import os
import pandas as pd
from openpyxl import load_workbook
from pandas.core.indexes.base import Index
os. chdir("C:\\Users\\NAME\\desktop")
workbook = pd.ExcelFile('Example.xlsx')
sheet = workbook.parse('Sheet1')
print (sheet.values[0])
os.getcwd()
path = os.getcwd()
for input in sheet.values:
if any(sheet.values):
if input == None:
break
else:
if any(sheet.values):
sheet.values == input
set
str1 = '1'.join(sheet.values[0])
str2 = '2'.join(sheet.values[1])
str3 = '3'.join(sheet.values[2])
str4 = '4'.join(sheet.values[3])
str5 = '5'.join(sheet.values[4])
str6 = '6'.join(sheet.values[5])
str7 = '7'.join(sheet.values[6])
str8 = '8'.join(sheet.values[7])
str9 = '9'.join(sheet.values[8])
str10 = '10'.join(sheet.values[9])
str11 = '11'.join(sheet.values[10])
str12 = '12'.join(sheet.values[11])
str13 = '13'.join(sheet.values[12])
str14 = '14'.join(sheet.values[13])
str15 = '15'.join(sheet.values[14])
str16 = '16'.join(sheet.values[15])
str17 = '17'.join(sheet.values[16])
str18 = '18'.join(sheet.values[17])
str19 = '19'.join(sheet.values[18])
str20 = '20'.join(sheet.values[19])
str21 = '21'.join(sheet.values[20])
########################ONE################################################
try:
if not os.path.exists(str1):
os.makedirs(str1)
except OSError:
print ("Creation of the directory %s failed" % str1)
else:
print ("Successfully created the directory %s " % str1)
########################TWO################################################
try:
if not os.path.exists(str2):
os.makedirs(str2)
except OSError:
print ("Creation of the directory %s failed" % str2)
else:
print ("Successfully created the directory %s " % str2)
########################THREE################################################
try:
if not os.path.exists(str3):
os.makedirs(str3)
except OSError:
print ("Creation of the directory %s failed" % str3)
else:
print ("Successfully created the directory %s " % str3)
########################FOUR################################################
try:
if not os.path.exists(str4):
os.makedirs(str4)
except OSError:
print ("Creation of the directory %s failed" % str4)
else:
print ("Successfully created the directory %s " % str4)
Note: the makedirs() code continues in the same way for every one of the strings.
The Excel document shows the following: enter image description here
This script results in: index 9 is out of bounds for axis 0 with size 9
This is honestly expected, as sheet.values only contains that many entries.
Can anyone help me? I know it is messy
Updated Code
import os
import pandas as pd
from openpyxl import load_workbook
from pandas.core.indexes.base import Index
os.chdir("C:\\Users\\NAME\\desktop")
workbook = pd.ExcelFile('Example.xlsx')
sheet = workbook.parse('Sheet1')
# Only preview the first record when the sheet actually has one.
if len(sheet.values):
    print(sheet.values[0])
path = os.getcwd()
print("The current working Directory is %s" % path)

# Create one directory per non-empty cell. The directory name is the cell's
# 1-based position within its record followed by the cell's content.
for record in sheet.values:
    for position, cell in enumerate(record, start=1):
        # Skip cells with no content: pandas reports them as NaN/None, and
        # concatenating those to a string would raise TypeError.
        if pd.isna(cell) or not str(cell).strip():
            continue
        # str(cell) also copes with numeric cells.
        dir_name = str(position) + str(cell)
        try:
            os.makedirs(dir_name, exist_ok=True)
        except OSError:
            print("Creation of the directory %s failed" % dir_name)
        else:
            print("Successfully created the directory %s " % dir_name)
It seems like you're trying to read the first column of a CSV file and create directories based on its values.
# Read the whole CSV up front so the file is closed before any directory
# is created. newline='' is what the csv module recommends for open().
with open(mypath+file, newline='') as file_name:
    rows = list(csv.reader(file_name))

# Create one directory per non-empty cell. The directory name is the cell's
# 1-based position within its record followed by the cell's content.
for record in rows:
    for position, cell in enumerate(record, start=1):
        # Skip values with no content.
        if not cell.strip():
            continue
        dir_name = str(position) + cell
        try:
            # https://docs.python.org/3/library/os.html#os.makedirs
            os.makedirs(dir_name, exist_ok=True)
        except OSError:
            # Report the directory actually being created (was: str1).
            print("Creation of the directory %s failed" % dir_name)
        else:
            print("Successfully created the directory %s " % dir_name)

Get rid of parenthesis in output

I think this is an easy question for you, as I am a beginner in Python 3.
When printing the header of a FASTA file, the output contains parentheses. How can I remove them?
import sys
from Bio import Entrez
from Bio import SeqIO
#define email for entrez login
db = "nuccore"
Entrez.email = "someone#email.com"
#load accessions from arguments
if len(sys.argv[1:]) > 1:
accs = sys.argv[1:]
else: #load accesions from stdin
accs = [ l.strip() for l in sys.stdin if l.strip() ]
#fetch
sys.stderr.write( "Fetching %s entries from GenBank: %s\n" % (len(accs), ", ".join(accs[:10])))
for i,acc in enumerate(accs):
try:
sys.stderr.write( " %9i %s \r" % (i+1,acc))
handle = Entrez.efetch(db=db, rettype="fasta", id=acc)
seq_record = SeqIO.read(handle, "fasta")
if (len(seq_record.seq) > 0):
header = ">" + seq_record.description + " Len:" , len(seq_record.seq)
print(header)
print(seq_record.seq)
except:
sys.stderr.write( "Error! Cannot fetch: %s \n" % acc)
./acc2fasta.py 163345 303239
It will return
(">M69206.1 Bovine MHC class I AW10 mRNA (haplotype AW10), 3' end Len:", 1379)
TCCTGCTGCTCTCGGGGGTCCTGGTCCTGACCGAGACCCGGGCTGGCTCCCACTCGATGAGGTATTTCAGCACCGCCGTGTCCCGGCCCGGCCTCGGGGAGCCCCGGTACCTGGAAGTCGGCTACGTGGACGACACGCAGTTCGTGCGGTTTGACAGCGACGCCCCGAATCCGAGGATGGAGCCGCGGGCGCGGTGGGTGGAGCAGGAGGGGCCGGAGTATTGGGATCGGGAGACGCAAAGGGCCAAGGGCAACGCACAATTTTTCCGAGTGAGCCTGAACAACCTGCGCGGCTACTACAACCAGAGCGAGGCCGGGTCTCACACCCTCCAGTGGATGTCCGGCTGCTACGTGGGGCCGGACGGGCGTCCTCCGCGCGGGTTCATGCAGTTCGGCTACGACGGCAGAGATTACCTCGCCCTGAACGAGGACCTGCGCTCCTGGACCGCGGTGGAGACGATGGCTCAGATCTCCAAACGCAAGATGGAGGCGGCCGGTGAAGCTGAGGTACAGAGGAACTACCTGGAGGGCCGGTGCGTGGAGTGGCTCCGCAGATACCTGGAGAACGGGAAGGACACGCTGCTGCGCGCAGACCCTCCAAAGGCACATGTGACCCGTCACCCGATCTCTGGTCGTGAGGTCACCCTGAGGTGCTGGGCCCTGGGCTTCTACCCTGAAGAGATCTCACTGACCTGGCAGCGCAATGGGGAGGACCAGACCCAGGACATGGAGCTTGTGGAGACCAGGCCTTCAGGGGACGGAAACTTCCAGAAGTGGGCGGCCCTGTTGGTGCCTTCTGGAGAGGAGCAGAAATACACATGCCAAGTGCAGCACGAGGGGCTTCAGGAGCCCCTCACCCTGAAATGGGAACCTCCTCAGCCCTCCTTCCTCACCATGGGCATCATTGTTGGCCTGGTTCTCCTCGTGGTCACTGGAGCTGTGGTGGCTGGAGTTGTGATCTGCATGAAGAAGCGCTCAGGTGAAAAACGAGGGACTTATATCCAGGCTTCAAGCAGTGACAGTGCCCAGGGCTCTGATGTGTCTCTCACGGTTCCTAAAGTGTGAGACACCTGCCTTCGGGGGACTGAGTGATGCTTCATCCCGCTATGTGACATCAGATCCCCGGAACCCCTTTTTCTGCAGCTGCATCTGAATGTGTCAGTGCCCCTATTCGCATAAGTAGGAGTTAGGGAGACTGGCCCACCCATGCCCACTGCTGCCCTTCCCCACTGCCGTCCCTCCCCACCCTGACCTGTGTTCTCTTCCCTGATCCACTGTCCTGTTCCAGCAGAGACGAGGCTGGACCATGTCTATCCCTGTCTTTGCTTTATATGCACTGAAAAATGATATCTTCTTTCCTTATTGAAAATAAAATCTGTC
Error! Cannot fetch: 303239
How to get rid of parenthesis in output ??
header = ">" + seq_record.description + " Len:" , len(seq_record.seq)
print(header)
The comma before len(seq_record.seq) makes header a tuple, so print(header) prints the representation of that tuple: with the comma (expected) but also with parentheses (unwanted).
The best way would be to join the data instead, so comma is inserted between the string fields, but tuple representation is left out:
print(",".join(header))
In your case it's a little trickier: you have to convert the non-string items to strings first (the tuple representation did that conversion for you, but join doesn't):
print(",".join([str(x) for x in header]))
result:
>M69206.1 Bovine MHC class I AW10 mRNA (haplotype AW10), 3' end Len:,1379

How to convert a set of osm files to shape files using ogr2ogr in python

I strongly suspect this question has already been asked, but I can't find the answer, so I am posting it here. I have a problem running a script that converts OSM files to SHP files. The script reads all the OSM files, but at the end it creates only one SHP file, for the first OSM file, instead of converting all of them. The code I used is below. Could you please help me resolve this?
from xml.dom import minidom
import os, sys
import xml.etree.ElementTree as ET
### ruta a gdal-data C:\Program Files (x86)\PostgreSQL\9.4\gdal-data
path = r"C:\Users\Administrator\Desktop\CHECKING\T2"
systemOutput = 'Shp'
print ("\n#### Execute python NY_osm2shapes")
print ("#### MONITORING CITIES")
print ("#### Conversor osm to shapes")
print ("#### OSM Path: " + path)
print "#### "
"""
Modify
Win: C:/Program Files/GDAL/gdal-data/osmconfig.ini
Linux: /usr/share/gdal/1.11/osmconfig.ini
report_all_ways=yes #activate lines without tag
attributes=landuse, plots #inside [lines]
attributes=landuse, plots #inside [multipolygons]
"""
### Check if path from argv
try:
if len(sys.argv) >= 2:
print("#### Path from argv: ", sys.argv[1])
path = sys.argv[1]
else:
print "#### Path set to", path
sys.exit()
except:
pass
#### Ogr config
print "\n#### Process: osm to shapes"
ogrOutputType = '' #-f "Esri Shapefile"'
ogrProjection = '' # -t_srs EPSG:4326' #+ epsg
ogrProjectionA = '' #-a_srs EPSG:3827'
ogrProjectionIn = '' #-s_srs EPSG:3827' #-t_srs EPSG:4326
ogrConfigType = ' --config OSM_USE_CUSTOM_INDEXING NO'
ogr2ogr = 'ogr2ogr %s %s %s %s %s %s -overwrite %s %s %s %s layer %s'
### Process
for l in os.walk(path):
archivos = l[2]
ruta = l[0]
for a in archivos:
if a.endswith(".osm"):
osmFile = os.path.join(ruta, a)
folder = os.path.join(ruta, systemOutput)
shapeFile = a[:-4]
ogrFileOutput = " -nln " + shapeFile
print "Archivo Shape: ", shapeFile,
layerType = shapeFile[-1]
if layerType=="0":
print "\t TIPO 0: Circles"
ogrSelectLayer = "lines"
ogrLcoType = ' -lco SHPT=ARC'
ogrSelect = ' -select ID_string'
elif layerType == "1":
print "\t TIPO 1: Blocks"
ogrSelectLayer = "lines"
ogrLcoType = ' -lco SHPT=ARC'
ogrSelect = ' -select Land_use'
elif layerType == "2":
print "\t TIPO 2: Plots"
ogrSelectLayer = "lines"
ogrLcoType = ' -lco SHPT=ARC'
ogrSelect = ' -select Plot'
elif layerType == "3":
print "\t TIPO 3: Medians"
ogrSelectLayer = "lines"
ogrLcoType = ' -lco SHPT=ARC'
ogrSelect = ' -select ID_string'
else:
print "ELSE ERROR*"
systemOutput = ogr2ogr % (ogrOutputType, folder, osmFile, ogrProjectionA, ogrProjectionIn, ogrProjection, ogrFileOutput, ogrLcoType, ogrConfigType, ogrSelect, ogrSelectLayer)
#print ("Fichero: ", osmFile, shapeFile, layerType, ogrSelectLayer)
os.system(systemOutput)
print "End process"
The way you used os.walk returns in archivos all osm files in the last ruta of the tree structure traversed. That is possibly (at least part of) your problem, or it may be so in the future.
You have to use os.walk differently:
import os

# Collect the full path of every .osm file found anywhere below `path`.
# str.endswith replaces the original regex, whose non-raw '\.osm$' literal
# contained an invalid escape sequence.
archivos = []
for ruta, _dirs, archs in os.walk(path):
    archivos.extend(
        os.path.join(ruta, arch) for arch in archs if arch.endswith(".osm")
    )

for osmFile in archivos:
    print(osmFile)
    ...
Now if the code inside the for loop does not do what you mean to, that is another issue.
I suggest you:
Add print( systemOutput ) to check that each command executed is what you intend it to be.
Check that the files and dirs referred to in that command are correct.
PS: each item in archivos will already contain the dir part, so you have to split the folder part, instead of joining.
PS2: you might need to use double backslashes for dirs. Also, bear in mind os.sep.

Python - Is this code lacking List Comprehensions and Generators [closed]

It's difficult to tell what is being asked here. This question is ambiguous, vague, incomplete, overly broad, or rhetorical and cannot be reasonably answered in its current form. For help clarifying this question so that it can be reopened, visit the help center.
Closed 10 years ago.
This is my first question, and I apologize if its a bit long on the code-example side.
As part of a job application I was asked to write a Bit Torrent file parser that exposed some of the fields. I did the code, and was told my code was "not quite at the level that we require from a team lead". Ouch!
That's fine; it's been years since I have coded, and list comprehensions and generators did not exist back in the day (I started with COBOL, but have coded with C, C++, etc). To me the below code is very clean. Sometimes there is no need to use more complex structures, syntax or patterns - "Keep it Simple".
Could I ask some Python gurus to critique this code please? I believe it would be useful to others to see where the code could be improved. There were more comments, etc (the bencode.py is from http://wiki.theory.org/Decoding_bencoded_data_with_python )
The areas I can think of:
in the display_* methods, to use list comprehensions to avoid the string of "if's"
better list comprehension / generator usage
bad use of globals
stdin/stdout/piping? This was a simple assignment, so I thought it was not necessary.
I was personally proud of this code, so would like to know where I need to improve. Thanks.
#!/usr/bin/env python2
"""Bit Torrent Parsing
Parses a Bit Torrent file.
A basic parser for Bit Torrent files. Visit http://wiki.theory.org/BitTorrentSpecification for the BitTorrent specification.
"""
__author__ = "...."
__version__ = "$Revision: 1.0 $"
__date__ = "$Date: 2012/10/26 11:08:46 $"
__copyright__ = "Enjoy & Distribute"
__license__ = "Python"
import bencode
import argparse
from argparse import RawTextHelpFormatter
import binascii
import time
import os
import pprint
torrent_files = 0
torrent_pieces = 0
def display_root(filename, root):
"""prints main (root) information on torrent"""
global torrent_files
global torrent_pieces
print
print "Bit Torrent Metafile Structure root nodes:"
print "------------------------------------------"
print "Torrent filename: ", filename
print " Info: %d file(s), %d pieces, ~%d kb/pc" % (
torrent_files,
torrent_pieces,
root['info']['piece length'] / 1024)
if 'private' in root['info']:
if root['info']['private'] == 1:
print " Publish presence: Private"
print " Announce: ", root['announce']
if 'announce-list' in root:
print " Announce List: "
for i in root['announce-list']:
print " ", i[0]
if 'creation date' in root:
print " Creation Date: ", time.ctime(root['creation date'])
if 'comment' in root:
print " Comment: ", root['comment']
if 'created-by' in root:
print " Created-By: ", root['created-by']
print " Encoding: ", root['encoding']
print
def display_torrent_file(info):
"""prints file information (single or multifile)"""
global torrent_files
global torrent_pieces
if 'files' in info:
# multipart file mode
# directory, followed by filenames
print "Files:"
max_files = args.maxfiles
display = max_files if (max_files < torrent_files) else torrent_files
print " %d File %d shown: " % (torrent_files, display)
print " Directory: ", info['name']
print " Filenames:"
i = 0
for files in info['files']:
if i < max_files:
prefix = ''
if len(files['path']) > 1:
prefix = './'
filename = prefix + '/'.join(files['path'])
if args.filehash:
if 'md5sum' in files:
md5hash = binascii.hexlify(files['md5sum'])
else:
md5hash = 'n/a'
print ' %s [hash: %s]' % (filename, md5hash)
else:
print ' %s ' % filename
i += 1
else:
break
else:
# single file mode
print "Filename: ", info['name']
print
def display_pieces(pieceDict):
    """Print the SHA1 hashes of the first pieces, limited by arg pieces.

    Reads the module-level ``args`` (for ``args.pieces``, falling back to
    10 when it is unset or zero) and ``torrent_pieces``.

    Keyword arguments:
    pieceDict -- mapping from 0-based piece number to that piece's hash
    """
    # limit since a torrent file can have 1,000's of pieces
    max_pieces = args.pieces if args.pieces else 10
    # Never claim to show more pieces than the torrent contains
    # (or fewer than none, for a negative limit).
    shown = max(0, min(max_pieces, torrent_pieces))
    print("Pieces:")
    print(" Torrent contains %s pieces, %d shown." % (
        torrent_pieces, shown))
    print(" piece : sha1")
    for i in range(shown):
        # print SHA1 hash in readable hex format
        print(' %5d : %s' % (i+1, binascii.hexlify(pieceDict[i])))
def parse_pieces(root):
"""create dictionary [ piece-num, hash ] from info's pieces
Returns the pieces dictionary. key is the piece number, value is the
SHA1 hash value (20-bytes)
Keyword arguments:
root -- a Bit Torrent Metafile root dictionary
"""
global torrent_pieces
pieceDict = {}
i = 0
while i < torrent_pieces:
pieceDict[i] = root['info']['pieces'][(20*i):(20*i)+20]
i += 1
return pieceDict
def parse_root_str(root_str):
    """Decode a bencoded torrent metafile and count its files and pieces.

    Returns a (torrent_root, torrent_piece) tuple: the complete Bit Torrent
    Metafile Structure dictionary and the pieces dictionary built by
    parse_pieces. If root_str cannot be decoded, an error message is
    printed and (None, None) is returned.

    Side effect: updates the module-level torrent_files and torrent_pieces
    counters.

    Keyword arguments:
    root_str -- a UTF-8 encoded string with root-level nodes (e.g., info)
    """
    global torrent_files
    global torrent_pieces
    try:
        torrent_root = bencode.decode(root_str)
    except StandardError:
        print('Error in torrent file, likely missing separators like ":"')
        # Stop here: torrent_root was never bound, so carrying on would
        # raise NameError. The caller already tests `if torrent_root:`.
        return None, None
    info = torrent_root['info']
    if 'files' in info:
        torrent_files = len(info['files'])
    else:
        # single file mode
        torrent_files = 1
    # 'pieces' is a concatenation of 20-byte SHA1 hashes.
    torrent_pieces = len(info['pieces']) // 20
    torrent_piece = parse_pieces(torrent_root)
    return torrent_root, torrent_piece
def readfile(filename):
    """Read *filename* in binary mode and return its data.

    Prints an error message and raises IOError("Filename not found.") when
    the path does not exist.
    """
    # Guard clause first, so the success path is not nested in an else.
    if not os.path.exists(filename):
        print("Error: filename: '%s' does not exist." % filename)
        raise IOError("Filename not found.")
    with open(filename, mode='rb') as f:
        return f.read()
if __name__ == "__main__":
parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter,
description=
"A basic parser for Bit Torrent files. Visit "
"http://wiki.theory.org/BitTorrentSpecification for "
"the BitTorrent specification.",
epilog=
"The keys for the Bit Torrent MetaInfo File Structure "
"are info, announce, announce-list, creation date, comment, "
"created by and encoding. \n"
"The Info Dictionary (info) is dependant on whether the torrent "
"file is a single or multiple file. The keys common to both "
"are piece length, pieces and private.\nFor single files, the "
"additional keys are name, length and md5sum.For multiple files "
"the keys are, name and files. files is also a dictionary with "
"keys length, md5sum and path.\n\n"
"Examples:\n"
"torrentparse.py --string 'l4:dir14:dir28:file.exte'\n"
"torrentparse.py --filename foo.torrent\n"
"torrentparse.py -f foo.torrent -f bar.torrent "
"--maxfiles 2 --filehash --pieces 2 -v")
filegroup = parser.add_argument_group('Input File or String')
filegroup.add_argument("-f", "--filename",
help="name of torrent file to parse",
action='append')
filegroup.add_argument("-fh", "--filehash",
help="display file's MD5 hash",
action = "store_true")
filegroup.add_argument("-maxf", "--maxfiles",
help="display X filenames (default=20)",
metavar = 'X',
type=int, default=20)
piecegroup = parser.add_argument_group('Torrent Pieces')
piecegroup.add_argument("-p", "--pieces",
help = "display X piece's SHA1 hash (default=10)",
metavar = 'X',
type = int)
parser.add_argument("-s", "--string",
help="string for bencoded dictionary item")
parser.add_argument("-v", "--verbose",
help = "Display MetaInfo file to stdout",
action = "store_true")
args = parser.parse_args()
if args.string:
print
text = bencode.decode(args.string)
print text
else:
for fn in args.filename:
try:
filedata = readfile(fn)
torrent_root, torrent_piece = parse_root_str(filedata)
except IOError:
print "Please enter a valid filename"
raise
if torrent_root:
display_root(fn, torrent_root)
display_torrent_file(torrent_root['info'])
if args.pieces:
display_pieces(torrent_piece)
verbose = True if args.verbose else False
if verbose:
print
print "Verbose Mode: \nPrinting root and info dictionaries"
# remove pieces as its long. display it afterwards
pieceless_root = torrent_root
del pieceless_root['info']['pieces']
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(pieceless_root)
print
print "Print info's piece information: "
pp.pprint(torrent_piece)
print
print "\n"
The following snippet:
i = 0
while i < torrent_pieces:
pieceDict[i] = root['info']['pieces'][(20*i):(20*i)+20]
i += 1
should be replaced by:
for i in range(torrent_pieces):
pieceDict[i] = root['info']['pieces'][(20*i):(20*i)+20]
That might be the kind of thing they want to see. In general, Python code shouldn't need explicit index variable manipulation in for loops very much.
The first thing I notice is that you've got a lot of global variables. That's no good; your code is no longer threadsafe, for one problem. (I see now that you noted that in your question, but that is something that should be changed.)
This looks a little odd:
i = 0
for files in info['files']:
if i < max_files:
# ...
else:
break
Instead, you could just do this:
for file in info['files'][:max_files]:
# ...
I also notice that you parse the file just enough to output all of the data pretty-printed. You might want to put it into appropriate structures. For example, have Torrent, Piece, and File classes.

BioPython: extracting sequence IDs from a Blast output file

I have a BLAST output file in XML format. It is 22 query sequences with 50 hits reported from each sequence. And I want to extract all the 50x22 hits. This is the code I currently have, but it only extracts the 50 hits from the first query.
from Bio.Blast import NCBIXM
blast_records = NCBIXML.parse(result_handle)
blast_record = blast_records.next()
save_file = open("/Users/jonbra/Desktop/my_fasta_seq.fasta", 'w')
for alignment in blast_record.alignments:
for hsp in alignment.hsps:
save_file.write('>%s\n' % (alignment.title,))
save_file.close()
Does anybody have any suggestions on how to extract all the hits? I guess I have to use something other than alignments.
Hope this was clear. Thanks!
Jon
This should get all records. The novelty compared with the original is the
for blast_record in blast_records
which is a Python idiom for iterating through the items in a "list-like" object, such as blast_records (checking the NCBIXML module documentation showed that parse() indeed returns an iterator)
# The module is NCBIXML (the original import misspelt it as NCBIXM).
from Bio.Blast import NCBIXML

blast_records = NCBIXML.parse(result_handle)
# The with-block closes the output file even if parsing fails part-way.
with open("/Users/jonbra/Desktop/my_fasta_seq.fasta", 'w') as save_file:
    # Iterate over every record, not just the first one.
    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                save_file.write('>%s\n' % (alignment.title,))
        # here possibly to output something to file, between each blast_record
I used this code for extract all the results
from Bio.Blast import NCBIXML

# Open the results in a with-block so the handle is closed when done.
with open("rpoD.xml") as result_handle:
    for record in NCBIXML.parse(result_handle):
        print("QUERY: %s" % record.query)
        for align in record.alignments:
            print(" MATCH: %s..." % align.title[:60])
            for hsp in align.hsps:
                print(" HSP, e=%f, from position %i to %i"
                      % (hsp.expect, hsp.query_start, hsp.query_end))
                if hsp.align_length < 60:
                    # Short alignment: print it in full.
                    print(" Query: %s" % hsp.query)
                    print(" Match: %s" % hsp.match)
                    print(" Sbjct: %s" % hsp.sbjct)
                else:
                    # Long alignment: print only the first 57 characters.
                    print(" Query: %s..." % hsp.query[:57])
                    print(" Match: %s..." % hsp.match[:57])
                    print(" Sbjct: %s..." % hsp.sbjct[:57])
print("Done")
or for less details
from Bio.Blast import NCBIXML

# Open the results in a with-block so the handle is closed when done.
with open("NC_003197.xml") as result_handle:
    for record in NCBIXML.parse(result_handle):
        # We want to ignore any queries with no search results:
        if not record.alignments:
            continue
        print("QUERY: %s..." % record.query[:60])
        for align in record.alignments:
            for hsp in align.hsps:
                print(" %s HSP, e=%f, from position %i to %i"
                      % (align.hit_id, hsp.expect,
                         hsp.query_start, hsp.query_end))
print("Done")
I used this site
http://www2.warwick.ac.uk/fac/sci/moac/currentstudents/peter_cock/python/rpsblast/

Categories