Get rid of parenthesis in output - python

I think this is an easy question for you as i am a beginner on python3.
When printing header of fasta file it contains parenthesis. How can i remove them ??
import sys
from Bio import Entrez
from Bio import SeqIO
#define email for entrez login
db = "nuccore"
Entrez.email = "someone#email.com"
#load accessions from arguments
if len(sys.argv[1:]) > 1:
accs = sys.argv[1:]
else: #load accesions from stdin
accs = [ l.strip() for l in sys.stdin if l.strip() ]
#fetch
sys.stderr.write( "Fetching %s entries from GenBank: %s\n" % (len(accs), ", ".join(accs[:10])))
for i,acc in enumerate(accs):
try:
sys.stderr.write( " %9i %s \r" % (i+1,acc))
handle = Entrez.efetch(db=db, rettype="fasta", id=acc)
seq_record = SeqIO.read(handle, "fasta")
if (len(seq_record.seq) > 0):
header = ">" + seq_record.description + " Len:" , len(seq_record.seq)
print(header)
print(seq_record.seq)
except:
sys.stderr.write( "Error! Cannot fetch: %s \n" % acc)
./acc2fasta.py 163345 303239
It will return
(">M69206.1 Bovine MHC class I AW10 mRNA (haplotype AW10), 3' end Len:", 1379)
TCCTGCTGCTCTCGGGGGTCCTGGTCCTGACCGAGACCCGGGCTGGCTCCCACTCGATGAGGTATTTCAGCACCGCCGTGTCCCGGCCCGGCCTCGGGGAGCCCCGGTACCTGGAAGTCGGCTACGTGGACGACACGCAGTTCGTGCGGTTTGACAGCGACGCCCCGAATCCGAGGATGGAGCCGCGGGCGCGGTGGGTGGAGCAGGAGGGGCCGGAGTATTGGGATCGGGAGACGCAAAGGGCCAAGGGCAACGCACAATTTTTCCGAGTGAGCCTGAACAACCTGCGCGGCTACTACAACCAGAGCGAGGCCGGGTCTCACACCCTCCAGTGGATGTCCGGCTGCTACGTGGGGCCGGACGGGCGTCCTCCGCGCGGGTTCATGCAGTTCGGCTACGACGGCAGAGATTACCTCGCCCTGAACGAGGACCTGCGCTCCTGGACCGCGGTGGAGACGATGGCTCAGATCTCCAAACGCAAGATGGAGGCGGCCGGTGAAGCTGAGGTACAGAGGAACTACCTGGAGGGCCGGTGCGTGGAGTGGCTCCGCAGATACCTGGAGAACGGGAAGGACACGCTGCTGCGCGCAGACCCTCCAAAGGCACATGTGACCCGTCACCCGATCTCTGGTCGTGAGGTCACCCTGAGGTGCTGGGCCCTGGGCTTCTACCCTGAAGAGATCTCACTGACCTGGCAGCGCAATGGGGAGGACCAGACCCAGGACATGGAGCTTGTGGAGACCAGGCCTTCAGGGGACGGAAACTTCCAGAAGTGGGCGGCCCTGTTGGTGCCTTCTGGAGAGGAGCAGAAATACACATGCCAAGTGCAGCACGAGGGGCTTCAGGAGCCCCTCACCCTGAAATGGGAACCTCCTCAGCCCTCCTTCCTCACCATGGGCATCATTGTTGGCCTGGTTCTCCTCGTGGTCACTGGAGCTGTGGTGGCTGGAGTTGTGATCTGCATGAAGAAGCGCTCAGGTGAAAAACGAGGGACTTATATCCAGGCTTCAAGCAGTGACAGTGCCCAGGGCTCTGATGTGTCTCTCACGGTTCCTAAAGTGTGAGACACCTGCCTTCGGGGGACTGAGTGATGCTTCATCCCGCTATGTGACATCAGATCCCCGGAACCCCTTTTTCTGCAGCTGCATCTGAATGTGTCAGTGCCCCTATTCGCATAAGTAGGAGTTAGGGAGACTGGCCCACCCATGCCCACTGCTGCCCTTCCCCACTGCCGTCCCTCCCCACCCTGACCTGTGTTCTCTTCCCTGATCCACTGTCCTGTTCCAGCAGAGACGAGGCTGGACCATGTCTATCCCTGTCTTTGCTTTATATGCACTGAAAAATGATATCTTCTTTCCTTATTGAAAATAAAATCTGTC
Error! Cannot fetch: 303239
How to get rid of parenthesis in output ??

header = ">" + seq_record.description + " Len:" , len(seq_record.seq)
print(header)
You're printing the representation of the tuple by doing so, with commas (expected) but also parentheses (unrequired)
The best way would be to join the data instead, so comma is inserted between the string fields, but tuple representation is left out:
print(",".join(header))
In your case it's a little tricker, you have to convert non-string arguments to string (tuple representation did the conversion but join doesn't):
print(",".join([str(x) for x in header]))
result:
>M69206.1 Bovine MHC class I AW10 mRNA (haplotype AW10), 3' end Len:,1379

Related

Python Regular Expression / Middle word in result

I have problem with unnecessary strings in result. I want pull only https from files.
My code is:
import sys
import os
import hashlib
import re
if len(sys.argv) < 2 :
sys.exit('Aby uzyc wpisz: python %s filename' % sys.argv[0])
if not os.path.exists(sys.argv[1]):
sys.exit('BLAD!: Plik "%s" nie znaleziony!' % sys.argv[1])
with open(sys.argv[1], 'rb') as f:
plik = f.read()
print("MD5: %s" % hashlib.md5(plik).hexdigest())
print("SHA1: %s" % hashlib.sha1(plik).hexdigest())
print("SHA256: %s" % hashlib.sha256(plik).hexdigest())
print("Podejrzane linki: \n")
pliki = open(sys.argv[1], 'r')
for line in pliki:
if re.search("(H|h)ttps:(.*)",line):
print(line)
elif re.search("(H|h)ttp:(.*)",line):
print(line)
pliki.close()
In result:
MD5: f16a93fd2d6f2a9f90af9f61a19d28bd
SHA1: 0a9b89624696757e188412da268afb2bf5b600aa
SHA256: 3b365deb0e272146f00f9d723a9fd4dbeacddc10123aec8237a37c10c19fe6df
Podejrzane linki:
GrizliPolSurls = "http://xxx.xxx.xxx.xxx"
FilnMoviehttpsd.Open "GET", "https://xxx.xxx.xxx.xxx",False
I want only strings in "" and starts from http or https e.g http://xxx.xxx.xxx.xxx
Desired result:
MD5: f16a93fd2d6f2a9f90af9f61a19d28bd
SHA1: 0a9b89624696757e188412da268afb2bf5b600aa
SHA256: 3b365deb0e272146f00f9d723a9fd4dbeacddc10123aec8237a37c10c19fe6df
Podejrzane linki:
http://xxx.xxx.xxx.xxx
https://xxx.xxx.xxx.xxx
You can use re.findall with the following regex (explained on regex101):
"([Hh]ttps?.*?)"
so:
import re
s = '''MD5MD5:: f16a93fd2d6f2a9f90af9f61a19d28bd
SHA1 f16a93fd2 : 0a9b89624696757e188412da268afb2bf5b600aa
SHA256: 3b365deb0e272146f00f9d723a9fd4dbeacddc10123aec8237a37c10c19fe6df
Podejrzane linki:
GrizliPolSurls = "http://xxx.xxx.xxx.xxx"
FilnMoviehttpsd.Open "GET", "https://xxx.xxx.xxx.xxx",False'''
urls = re.findall('"([Hh]ttps?.*?)"', s)
#['http://xxx.xxx.xxx.xxx', 'https://xxx.xxx.xxx.xxx']
You need this pattern: (?<=")http[^"]+.
(?<=") - positive lookbehind, to determine if " precceds current position.
http - match http literally.
[^"]+ - match everything until ", this is negated class technique to avoid quantifiers :)
Demo
re.search() returns a Match Object
You have to fetch the information from the result:
line = "my text line contains a http://192.168.1.1 magic url"
result = re.search("[Hh]ttps?://\d+\.\d+\.\d+\.\d+", line)
print(result.group()) # will print http://192.168.1.1

find and replace regular expression rather than full string

I've loaded a dictionary of "regex":"picture" pairs parsed from a json.
These values are intended to match the regex within a message string and replace it with the picture for display in a flash plugin that displays HTML text.
for instance typing:
Hello MVGame everyone.
Would return:
Hello <img src='http://static-cdn.jtvnw.net/jtv_user_pictures/chansub-global-emoticon-1a1a8bb5cdf6efb9-24x32.png' height = '32' width = '24'> everyone.
However:
If I type,
Hello :) everyone.
it will not parse the :) because this is encoded as a regular expression "\\:-?\\)" rather than just a string match.
How do I get it to parse the regular expression as the matching parameter?
Here is my test code:
# regular expression test
import urllib
import json # for loading json's for emoticons
import urllib.request # more for loadings jsons from urls
import re # allows pattern filtering for emoticons
def loademotes():
#Create emoteicon dictionary
try:
print ("Trying to load emoteicons from twitch")
response = urllib.request.urlopen('https://api.twitch.tv/kraken/chat/emoticons').read()
mydata = json.loads(response.decode('utf-8'))
for idx,item in enumerate(mydata['emoticons']):
regex = item['regex']
url = "<img src='" + item['images'][0]['url'] + "'" + " height = '" + str(item['images'][0]['height']) + "'" + " width = '" + str(item['images'][0]['width']) + "' >"
emoticonDictionary[regex] = url
print ("All emoteicons loaded")
except IOError as e:
print ("I/O error({0}) : {1}".format(e.errno, e.strerror))
print ("Cannot load emoteicons.")
emoticonDictionary = {} # create emoticon dictionary indexed by words returns url in html image tags
loademotes()
while 1:
myString = input ("Here you type something : ")
pattern = re.compile(r'\b(' + '|'.join(emoticonDictionary.keys()) + r')\b')
results = pattern.sub(lambda x: emoticonDictionary[x.group()], myString)
print (results)
I think you could make sure each syntactic character in regular expressions is surrounded by character classes before you feed it to the re. Like write something that takes :) and makes it [:][)]

Double quote string manipulation

I have some input data from ASCII files which uses double quote to encapsulate string as well as still use double quote inside those strings, for example:
"Reliable" "Africa" 567.87 "Bob" "" "" "" "S 05`56'21.844"" "No Shift"
Notice the double quote used in the coordinate.
So I have been using:
valList = shlex.split(line)
But shlex get's confused with the double quote used as the second in the coordinate.
I've been doing a find and replace on '\"\"' to '\\\"\"'. This of course turns an empty strings to \"" as well so I do a find and replace on (this time with spaces) ' \\\"\" ' to ' \"\"" '. Not exactly the most efficient way of doing it!
Any suggestions on handling this double quote in the coordinate?
I would do it this way:
I would treat this line of text as a csv file. Then according to RFC 4180 :
If double-quotes are used to enclose fields, then a double-quote
appearing inside a field must be escaped by preceding it with
another double quote. For example:
"aaa","b""bb","ccc"
Then all you would need to do is to add another " to your coordinates. So it would look like this "S 0556'21.844"""(NOTE extra quote) Then you can use a standartcsv` module to break it apart and extract necessary information.
>>> from StringIO import StringIO
>>> import csv
>>>
>>> test = '''"Reliable" "Africa" 567.87 "Bob" "" "" "" "S 05`56'21.844""" "No Shift"'''
>>> test_obj = StringIO(test)
>>> reader = csv.reader(test_obj, delimiter=' ', quotechar='"', quoting=csv.QUOTE_ALL)
>>> for i in reader:
... print i
...
The output would be :
['Reliable', 'Africa', '567.87', 'Bob', '', '', '', 'S 05`56\'21.844"', 'No Shift']
I'm not good with regexes, but this non-regex suggestion might help ...
INPUT = ('"Reliable" "Africa" 567.87 "Bob" "" "" "" "S 05`56'
"'"
'21.844"" "No Shift"')
def main(input):
output = input
surrounding_quote_symbol = '<!>'
if input.startswith('"'):
output = '%s%s' % (surrounding_quote_symbol, output[1:])
if input.endswith('"'):
output = '%s%s' % (output[:-1], surrounding_quote_symbol)
output = output.replace('" ', '%s ' % surrounding_quote_symbol)
output = output.replace(' "', ' %s' % surrounding_quote_symbol)
print "Stage 1:", output
output = output.replace('"', '\"')
output = output.replace(surrounding_quote_symbol, '"')
return output
if __name__ == "__main__":
output = main(INPUT)
print "End results:", output

Removing \r and \n from list

I'm trying to remove \r and \n from a urban dictionary json api but everytime I use re.sub I get this:
expected string or buffer
I'm not sure why though, but here's the code:
elif used_prefix and cmd == "udi" and len(args) > 0 and self.getAccess(user) >= 1:
try:
f = urllib.request.urlopen("http://api.urbandictionary.com/v0/define?term=%s" % args.lower().replace(' ', '+'))
data = json.loads(f.readall().decode("utf-8"))
data = re.sub(r'\s+', ' ', data).replace("\\","")
if (len(data['list']) > 0):
definition = data['list'][0][u'definition']
example = data['list'][0][u'example']
permalink = data['list'][0][u'permalink']
room.message("Urban Dictionary search for %s: %s Example: %s Link: %s" % (args.title(), definition, example, permalink), True)
else: room.message("Word not found.")
except:
room.message((str(sys.exc_info()[1])))
print(traceback.format_exc())
This is the traceback:
Traceback (most recent call last): File "C:\Users\dell\Desktop\b0t\TutorialBot.py", line 2186, in onMessage data = re.sub(r'\s+', ' ', data).replace("\\","") File "C:\lib\re.py", line 170, in sub return _compile(pattern, flags).sub(repl, string, count) TypeError: expected string or buffer
The problem is that you are trying to use re.sub on a dict rather than a string. Further, your code seems to be a little messy in places. Try this instead:
import urllib2
import json
import re
def test(*args):
f = urllib2.urlopen("http://api.urbandictionary.com/v0/define?term=%s" % '+'.join(args).lower()) # note urllib2.urlopen rather than urllib.request.urlopen
data = json.loads(f.read().decode("utf-8")) # note f.read() instead of f.readall()
if len(data['list']) > 0:
definition = data['list'][0][u'definition']
example = data['list'][0][u'example']
permalink = data['list'][0][u'permalink']
return "Urban Dictionary search for %s: %s Example: %s Link: %s" % (str(args), definition, example, permalink) # returns a string
print test('mouth', 'hugging').replace('\n\n', '\n') # prints the string after replacing '\n\n' with '\n'
The result:
Urban Dictionary search for ('mouth', 'hugging'): When you put a beer bottle in your mouth, and keep your mouth wrapped around it all day. Example: Josh: "mhmgdfhwrmhhh (attempts to talk while drinking a beer)"
Ryan: "You know I can't hear you when you're mouth hugging."
Josh: "mmmffwrrggddsshh" Link: http://mouth-hugging.urbanup.com/7493517

Python - Is this code lacking List Comprehensions and Generators [closed]

It's difficult to tell what is being asked here. This question is ambiguous, vague, incomplete, overly broad, or rhetorical and cannot be reasonably answered in its current form. For help clarifying this question so that it can be reopened, visit the help center.
Closed 10 years ago.
This is my first question, and I apologize if its a bit long on the code-example side.
As part of a job application I was asked to write a Bit Torrent file parser that exposed some of the fields. I did the code, and was told my code was "not quite at the level that we require from a team lead". Ouch!
That's fine its, been years since I have coded, and list comprehensions, generators did not exist back in the day (I started with COBOL, but have coded with C, C++, etc). To me the below code is very clean. Sometimes there is no need to use more complex structures, syntax or patterns - "Keep it Simple".
Could I ask some Python guru's to critique this code please? I'm believe it is useful to others to see where the code could be improved. There were more comments, etc (the bencode.py is from http://wiki.theory.org/Decoding_bencoded_data_with_python )
The areas I can think of:
in the display_* methods to use list comprehensions to avoid the string of "if's"better
list comprehension / generator usage
bad use of globals
stdin/stdout/piping? This was a simple assignment, so I thought it was not necessary.
I was personally proud of this code, so would like to know where I need to improve. Thanks.
#!/usr/bin/env python2
"""Bit Torrent Parsing
Parses a Bit Torrent file.
A basic parser for Bit Torrent files. Visit http://wiki.theory.org/BitTorrentSpecification for the BitTorrent specification.
"""
__author__ = "...."
__version__ = "$Revision: 1.0 $"
__date__ = "$Date: 2012/10/26 11:08:46 $"
__copyright__ = "Enjoy & Distribute"
__license__ = "Python"
import bencode
import argparse
from argparse import RawTextHelpFormatter
import binascii
import time
import os
import pprint
torrent_files = 0
torrent_pieces = 0
def display_root(filename, root):
"""prints main (root) information on torrent"""
global torrent_files
global torrent_pieces
print
print "Bit Torrent Metafile Structure root nodes:"
print "------------------------------------------"
print "Torrent filename: ", filename
print " Info: %d file(s), %d pieces, ~%d kb/pc" % (
torrent_files,
torrent_pieces,
root['info']['piece length'] / 1024)
if 'private' in root['info']:
if root['info']['private'] == 1:
print " Publish presence: Private"
print " Announce: ", root['announce']
if 'announce-list' in root:
print " Announce List: "
for i in root['announce-list']:
print " ", i[0]
if 'creation date' in root:
print " Creation Date: ", time.ctime(root['creation date'])
if 'comment' in root:
print " Comment: ", root['comment']
if 'created-by' in root:
print " Created-By: ", root['created-by']
print " Encoding: ", root['encoding']
print
def display_torrent_file(info):
"""prints file information (single or multifile)"""
global torrent_files
global torrent_pieces
if 'files' in info:
# multipart file mode
# directory, followed by filenames
print "Files:"
max_files = args.maxfiles
display = max_files if (max_files < torrent_files) else torrent_files
print " %d File %d shown: " % (torrent_files, display)
print " Directory: ", info['name']
print " Filenames:"
i = 0
for files in info['files']:
if i < max_files:
prefix = ''
if len(files['path']) > 1:
prefix = './'
filename = prefix + '/'.join(files['path'])
if args.filehash:
if 'md5sum' in files:
md5hash = binascii.hexlify(files['md5sum'])
else:
md5hash = 'n/a'
print ' %s [hash: %s]' % (filename, md5hash)
else:
print ' %s ' % filename
i += 1
else:
break
else:
# single file mode
print "Filename: ", info['name']
print
def display_pieces(pieceDict):
"""prints SHA1 hash for pieces, limited by arg pieces"""
global torrent_files
global torrent_pieces
# global pieceDict
# limit since a torrent file can have 1,000's of pieces
max_pieces = args.pieces if args.pieces else 10
print "Pieces:"
print " Torrent contains %s pieces, %d shown."% (
torrent_pieces, max_pieces)
print " piece : sha1"
i = 0
while i < max_pieces and i < torrent_pieces:
# print SHA1 hash in readable hex format
print ' %5d : %s' % (i+1, binascii.hexlify(pieceDict[i]))
i += 1
def parse_pieces(root):
"""create dictionary [ piece-num, hash ] from info's pieces
Returns the pieces dictionary. key is the piece number, value is the
SHA1 hash value (20-bytes)
Keyword arguments:
root -- a Bit Torrent Metafile root dictionary
"""
global torrent_pieces
pieceDict = {}
i = 0
while i < torrent_pieces:
pieceDict[i] = root['info']['pieces'][(20*i):(20*i)+20]
i += 1
return pieceDict
def parse_root_str(root_str):
"""create dictionary [ piece-num, hash ] from info's pieces
Returns the complete Bit Torrent Metafile Structure dictionary with
relevant Bit Torrent Metafile nodes and their values.
Keyword arguments:
root_str -- a UTF-8 encoded string with root-level nodes (e.g., info)
"""
global torrent_files
global torrent_pieces
try:
torrent_root = bencode.decode(root_str)
except StandardError:
print 'Error in torrent file, likely missing separators like ":"'
if 'files' in torrent_root['info']:
torrent_files = len(torrent_root['info']['files'])
else:
torrent_files = 1
torrent_pieces = len(torrent_root['info']['pieces']) / 20
torrent_piece = parse_pieces(torrent_root)
return torrent_root, torrent_piece
def readfile(filename):
"""read file and return file's data"""
global torrent_files
global torrent_pieces
if os.path.exists(filename):
with open(filename, mode='rb') as f:
filedata = f.read()
else:
print "Error: filename: '%s' does not exist." % filename
raise IOError("Filename not found.")
return filedata
if __name__ == "__main__":
parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter,
description=
"A basic parser for Bit Torrent files. Visit "
"http://wiki.theory.org/BitTorrentSpecification for "
"the BitTorrent specification.",
epilog=
"The keys for the Bit Torrent MetaInfo File Structure "
"are info, announce, announce-list, creation date, comment, "
"created by and encoding. \n"
"The Info Dictionary (info) is dependant on whether the torrent "
"file is a single or multiple file. The keys common to both "
"are piece length, pieces and private.\nFor single files, the "
"additional keys are name, length and md5sum.For multiple files "
"the keys are, name and files. files is also a dictionary with "
"keys length, md5sum and path.\n\n"
"Examples:\n"
"torrentparse.py --string 'l4:dir14:dir28:file.exte'\n"
"torrentparse.py --filename foo.torrent\n"
"torrentparse.py -f foo.torrent -f bar.torrent "
"--maxfiles 2 --filehash --pieces 2 -v")
filegroup = parser.add_argument_group('Input File or String')
filegroup.add_argument("-f", "--filename",
help="name of torrent file to parse",
action='append')
filegroup.add_argument("-fh", "--filehash",
help="display file's MD5 hash",
action = "store_true")
filegroup.add_argument("-maxf", "--maxfiles",
help="display X filenames (default=20)",
metavar = 'X',
type=int, default=20)
piecegroup = parser.add_argument_group('Torrent Pieces')
piecegroup.add_argument("-p", "--pieces",
help = "display X piece's SHA1 hash (default=10)",
metavar = 'X',
type = int)
parser.add_argument("-s", "--string",
help="string for bencoded dictionary item")
parser.add_argument("-v", "--verbose",
help = "Display MetaInfo file to stdout",
action = "store_true")
args = parser.parse_args()
if args.string:
print
text = bencode.decode(args.string)
print text
else:
for fn in args.filename:
try:
filedata = readfile(fn)
torrent_root, torrent_piece = parse_root_str(filedata)
except IOError:
print "Please enter a valid filename"
raise
if torrent_root:
display_root(fn, torrent_root)
display_torrent_file(torrent_root['info'])
if args.pieces:
display_pieces(torrent_piece)
verbose = True if args.verbose else False
if verbose:
print
print "Verbose Mode: \nPrinting root and info dictionaries"
# remove pieces as its long. display it afterwards
pieceless_root = torrent_root
del pieceless_root['info']['pieces']
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(pieceless_root)
print
print "Print info's piece information: "
pp.pprint(torrent_piece)
print
print "\n"
The following snippet:
i = 0
while i < torrent_pieces:
pieceDict[i] = root['info']['pieces'][(20*i):(20*i)+20]
i += 1
should be replaced by:
for i in range(torrent_pieces):
pieceDict[i] = root['info']['pieces'][(20*i):(20*i)+20]
That might be the kind of thing they want to see. In general, Python code shouldn't need explicit index variable manipulation in for loops very much.
The first thing I notice is that you've got a lot of global variables. That's no good; your code is no longer threadsafe, for one problem. (I see now that you noted that in your question, but that is something that should be changed.)
This looks a little odd:
i = 0
for files in info['files']:
if i < max_files:
# ...
else:
break
Instead, you could just do this:
for file in info['files'][:max_files]:
# ...
I also notice that you parse the file just enough to output all of the data pretty-printed. You might want to put it into appropriate structures. For example, have Torrent, Piece, and File classes.

Categories