When I run the following script, the error "Command line argument error: Argument "query". File is not accessible" occurs. I'm using Python 3.4.2.
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import subprocess
import tempfile
import sys
def main():
    # read name file and put all identifications into a list
    infile_I = open('OTU_name.txt','r')
    name = infile_I.read().split('>')
    infile_I.close()
    # extract sequence segments to a temporary file one at a time
    for i in name:
        i = i.replace('\n','')
        for j in SeqIO.parse("GemSIM_OTU_ids.fa","fasta"):
            if str(i) == str(j.id):
                f = tempfile.NamedTemporaryFile()
                record = j.seq
                f.write(bytes(str(record),'UTF-8'))
                f.seek(0)
                f = f.read().decode()
                Result = subprocess.Popen(['blastn','-remote','-db','chromosome','-query',f,'-out',str(i)],stdout=subprocess.PIPE)
                output = Result.communicate()[0]

if __name__== '__main__': main()
f = tempfile.NamedTemporaryFile() returns a file-like object, which you're trying to provide as a command line argument. Instead, you want the actual filename, which is available via its .name attribute. I'm somewhat confused, though, about why you create a tempfile, write to it, seek back to position 0, and then replace your tempfile object f with the contents of the file. I suspect you don't want that replacement and should use f.name for your query:
Result = subprocess.Popen(['blastn','-remote','-db','chromosome','-query',f.name,'-out',str(i)],stdout=subprocess.PIPE)
Also, there are some convenient wrapper functions around subprocess.Popen, such as subprocess.check_output, which are somewhat more explicit about your intent and could be used here instead.
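For illustration, here's a minimal sketch of that inner loop using the temp file's .name together with subprocess.check_output; the file and database names are taken from your script, and it assumes a Unix-like system where the still-open temp file can be reopened by blastn:

for j in SeqIO.parse("GemSIM_OTU_ids.fa", "fasta"):
    if str(i) == str(j.id):
        with tempfile.NamedTemporaryFile(suffix='.fa') as f:
            f.write(bytes(str(j.seq), 'UTF-8'))
            f.flush()  # make sure the data is on disk before blastn reads it
            # pass the *path* of the temp file, not the file object or its contents
            output = subprocess.check_output(
                ['blastn', '-remote', '-db', 'chromosome',
                 '-query', f.name, '-out', str(i)])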
I have managed to get my first Python script to work: it downloads a list of .ZIP files from a URL and then proceeds to extract the ZIP files and write them to disk.
I am now at a loss to achieve the next step.
My primary goal is to download and extract the zip file and pass the contents (CSV data) via a TCP stream. I would prefer not to actually write any of the zip or extracted files to disk if I could get away with it.
Here is my current script which works but unfortunately has to write the files to disk.
import urllib, urllister
import zipfile
import urllib2
import os
import time
import pickle
# check for extraction directories existence
if not os.path.isdir('downloaded'):
os.makedirs('downloaded')
if not os.path.isdir('extracted'):
os.makedirs('extracted')
# open logfile for downloaded data and save to local variable
if os.path.isfile('downloaded.pickle'):
downloadedLog = pickle.load(open('downloaded.pickle'))
else:
downloadedLog = {'key':'value'}
# remove entries older than 5 days (to maintain speed)
# path of zip files
zipFileURL = "http://www.thewebserver.com/that/contains/a/directory/of/zip/files"
# retrieve list of URLs from the webservers
usock = urllib.urlopen(zipFileURL)
parser = urllister.URLLister()
parser.feed(usock.read())
usock.close()
parser.close()
# only parse urls
for url in parser.urls:
if "PUBLIC_P5MIN" in url:
# download the file
downloadURL = zipFileURL + url
outputFilename = "downloaded/" + url
# check if file already exists on disk
if url in downloadedLog or os.path.isfile(outputFilename):
print "Skipping " + downloadURL
continue
print "Downloading ",downloadURL
response = urllib2.urlopen(downloadURL)
zippedData = response.read()
# save data to disk
print "Saving to ",outputFilename
output = open(outputFilename,'wb')
output.write(zippedData)
output.close()
# extract the data
zfobj = zipfile.ZipFile(outputFilename)
for name in zfobj.namelist():
uncompressed = zfobj.read(name)
# save uncompressed data to disk
outputFilename = "extracted/" + name
print "Saving extracted file to ",outputFilename
output = open(outputFilename,'wb')
output.write(uncompressed)
output.close()
# send data via tcp stream
# file successfully downloaded and extracted store into local log and filesystem log
downloadedLog[url] = time.time();
pickle.dump(downloadedLog, open('downloaded.pickle', "wb" ))
Below is a code snippet I used to fetch a zipped CSV file; please have a look:
Python 2:
from StringIO import StringIO
from zipfile import ZipFile
from urllib import urlopen
resp = urlopen("http://www.test.com/file.zip")
myzip = ZipFile(StringIO(resp.read()))
for line in myzip.open(file).readlines():
    print line
Python 3:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
# or: requests.get(url).content
resp = urlopen("http://www.test.com/file.zip")
myzip = ZipFile(BytesIO(resp.read()))
for line in myzip.open(file).readlines():
    print(line.decode('utf-8'))
Here file is a string. To get the actual string that you want to pass, you can use zipfile.namelist(). For instance,
resp = urlopen('http://mlg.ucd.ie/files/datasets/bbc.zip')
myzip = ZipFile(BytesIO(resp.read()))
myzip.namelist()
# ['bbc.classes', 'bbc.docs', 'bbc.mtx', 'bbc.terms']
My suggestion would be to use a StringIO object. They emulate files, but reside in memory. So you could do something like this:
# get_zip_data() gets a zip archive containing 'foo.txt', reading 'hey, foo'
import zipfile
from StringIO import StringIO
zipdata = StringIO()
zipdata.write(get_zip_data())
myzipfile = zipfile.ZipFile(zipdata)
foofile = myzipfile.open('foo.txt')
print foofile.read()
# output: "hey, foo"
Or more simply (apologies to Vishal):
myzipfile = zipfile.ZipFile(StringIO(get_zip_data()))
for name in myzipfile.namelist():
    [ ... ]
In Python 3 use BytesIO instead of StringIO:
import zipfile
from io import BytesIO
filebytes = BytesIO(get_zip_data())
myzipfile = zipfile.ZipFile(filebytes)
for name in myzipfile.namelist():
    [ ... ]
I'd like to offer an updated Python 3 version of Vishal's excellent answer, which used Python 2, along with some explanation of the adaptations/changes, some of which may already have been mentioned.
from io import BytesIO
from zipfile import ZipFile
import urllib.request
url = urllib.request.urlopen("http://www.unece.org/fileadmin/DAM/cefact/locode/loc162txt.zip")
with ZipFile(BytesIO(url.read())) as my_zip_file:
    for contained_file in my_zip_file.namelist():
        # with open(("unzipped_and_read_" + contained_file + ".file"), "wb") as output:
        for line in my_zip_file.open(contained_file).readlines():
            print(line)
            # output.write(line)
Necessary changes:
There's no StringIO module in Python 3 (it's been moved to io.StringIO). Instead, I use io.BytesIO, because we will be handling a bytestream -- Docs, also this thread.
urlopen:
"The legacy urllib.urlopen function from Python 2.6 and earlier has been discontinued; urllib.request.urlopen() corresponds to the old urllib2.urlopen.", Docs and this thread.
Note:
In Python 3, the printed output lines will look like so: b'some text'. This is expected, as they aren't strings - remember, we're reading a bytestream. Have a look at Dan04's excellent answer.
A few minor changes I made:
I use with ... as instead of zipfile = ... according to the Docs.
The script now uses .namelist() to cycle through all the files in the zip and print their contents.
I moved the creation of the ZipFile object into the with statement, although I'm not sure if that's better.
I added (and commented out) an option to write the bytestream to file (per file in the zip), in response to NumenorForLife's comment; it adds "unzipped_and_read_" to the beginning of the filename and a ".file" extension (I prefer not to use ".txt" for files with bytestrings). The indenting of the code will, of course, need to be adjusted if you want to use it.
Need to be careful here -- because we have a byte string, we use binary mode, so "wb"; I have a feeling that writing binary opens a can of worms anyway...
I am using an example file, the UN/LOCODE text archive:
What I didn't do:
NumenorForLife asked about saving the zip to disk. I'm not sure what he meant by it -- downloading the zip file? That's a different task; see Oleh Prypin's excellent answer.
Here's a way:
import urllib.request
import shutil
with urllib.request.urlopen("http://www.unece.org/fileadmin/DAM/cefact/locode/2015-2_UNLOCODE_SecretariatNotes.pdf") as response, open("downloaded_file.pdf", 'wb') as out_file:
    shutil.copyfileobj(response, out_file)
I'd like to add my Python 3 answer for completeness:
from io import BytesIO
from zipfile import ZipFile
import requests
def get_zip(file_url):
    url = requests.get(file_url)
    zipfile = ZipFile(BytesIO(url.content))
    files = [zipfile.open(file_name) for file_name in zipfile.namelist()]
    return files.pop() if len(files) == 1 else files
Write to a temporary file which resides in RAM.
It turns out the tempfile module ( http://docs.python.org/library/tempfile.html ) has just the thing:
tempfile.SpooledTemporaryFile([max_size=0[, mode='w+b'[, bufsize=-1[, suffix=''[, prefix='tmp'[, dir=None]]]]]])

This function operates exactly as TemporaryFile() does, except that data is spooled in memory until the file size exceeds max_size, or until the file's fileno() method is called, at which point the contents are written to disk and operation proceeds as with TemporaryFile().

The resulting file has one additional method, rollover(), which causes the file to roll over to an on-disk file regardless of its size.

The returned object is a file-like object whose _file attribute is either a StringIO object or a true file object, depending on whether rollover() has been called. This file-like object can be used in a with statement, just like a normal file.

New in version 2.6.
Or, if you're lazy and you have a tmpfs-mounted /tmp on Linux, you can just make a file there, but then you have to delete it yourself and deal with naming.
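For example, a minimal sketch of SpooledTemporaryFile applied to the zip-in-memory case; the 1 MB threshold is an arbitrary choice, and zipped_data stands for the bytes you downloaded earlier:

import tempfile
import zipfile

with tempfile.SpooledTemporaryFile(max_size=1024 * 1024) as tmp:
    tmp.write(zipped_data)   # stays in memory unless it grows past max_size
    tmp.seek(0)
    with zipfile.ZipFile(tmp) as zf:
        print(zf.namelist())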
Adding on to the other answers using requests:
# download from web
import requests
url = 'http://mlg.ucd.ie/files/datasets/bbc.zip'
content = requests.get(url)
# unzip the content
from io import BytesIO
from zipfile import ZipFile
f = ZipFile(BytesIO(content.content))
print(f.namelist())
# outputs ['bbc.classes', 'bbc.docs', 'bbc.mtx', 'bbc.terms']
Use help(f) to get more details on its methods, e.g. extractall(), which extracts the contents of the zip file to disk so they can later be used with with open.
All of these answers appear too bulky and long. Use requests to shorten the code, e.g.:
import requests, zipfile, io
r = requests.get(zip_file_url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall("/path/to/directory")
Vishal's example, however great, gets confusing when it comes to the file name, and I do not see the merit of redefining 'zipfile'.
Here is my example that downloads a zip that contains some files, one of which is a csv file that I subsequently read into a pandas DataFrame:
from StringIO import StringIO
from zipfile import ZipFile
from urllib import urlopen
import pandas
url = urlopen("https://www.federalreserve.gov/apps/mdrm/pdf/MDRM.zip")
zf = ZipFile(StringIO(url.read()))
for item in zf.namelist():
    print("File in zip: "+ item)
# find the first matching csv file in the zip:
match = [s for s in zf.namelist() if ".csv" in s][0]
# the first line of the file contains a string - that line shall be ignored, hence skiprows
df = pandas.read_csv(zf.open(match), low_memory=False, skiprows=[0])
(Note, I use Python 2.7.13)
This is the exact solution that worked for me. I just tweaked it a little for a Python 3 version by removing StringIO and using the io library instead.
Python 3 Version
from io import BytesIO
from zipfile import ZipFile
import pandas
import requests
url = "https://www.nseindia.com/content/indices/mcwb_jun19.zip"
content = requests.get(url)
zf = ZipFile(BytesIO(content.content))
for item in zf.namelist():
    print("File in zip: "+ item)
# find the first matching csv file in the zip:
match = [s for s in zf.namelist() if ".csv" in s][0]
# the first line of the file contains a string - that line shall be ignored, hence skiprows
df = pandas.read_csv(zf.open(match), low_memory=False, skiprows=[0])
It wasn't obvious in Vishal's answer what the file name was supposed to be in cases where there is no file on disk. I've modified his answer to work without modification for most needs.
from StringIO import StringIO
from zipfile import ZipFile
from urllib import urlopen
def unzip_string(zipped_string):
    unzipped_string = ''
    zipfile = ZipFile(StringIO(zipped_string))
    for name in zipfile.namelist():
        unzipped_string += zipfile.open(name).read()
    return unzipped_string
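For reference, here is a Python 3 sketch of the same helper using io.BytesIO (not part of the original answer); note that it accumulates and returns bytes rather than a str:

from io import BytesIO
from zipfile import ZipFile

def unzip_bytes(zipped_bytes):
    unzipped = b''
    with ZipFile(BytesIO(zipped_bytes)) as zf:
        for name in zf.namelist():
            unzipped += zf.open(name).read()
    return unzipped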
Use the zipfile module. To extract a file from a URL, you'll need to wrap the result of a urlopen call in a BytesIO object. This is because the result of a web request returned by urlopen doesn't support seeking:
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile
zip_url = 'http://example.com/my_file.zip'
with urlopen(zip_url) as f:
    with BytesIO(f.read()) as b, ZipFile(b) as myzipfile:
        foofile = myzipfile.open('foo.txt')
        print(foofile.read())
If you already have the file downloaded locally, you don't need BytesIO, just open it in binary mode and pass to ZipFile directly:
from zipfile import ZipFile
zip_filename = 'my_file.zip'
with open(zip_filename, 'rb') as f:
    with ZipFile(f) as myzipfile:
        foofile = myzipfile.open('foo.txt')
        print(foofile.read().decode('utf-8'))
Again, note that you have to open the file in binary ('rb') mode, not as text or you'll get a zipfile.BadZipFile: File is not a zip file error.
It's good practice to use all these things as context managers with the with statement, so that they'll be closed properly.
Within my code I am attempting to import modules written by myself. Said modules interact with files on the system, either encrypting, decrypting or removing them altogether. When trying to import these modules I receive the below:
FileNotFoundError: [Errno 2] No such file or directory: <Directory>
Oddly, this error seems to appear before anything local within the file runs (the calls are at the end), which makes me believe Python is parsing the files during the import phase and not finding the files (apologies, but I am a novice when it comes to importing self-created modules).
Below is one of the methods I am importing (also the one flagging the error)
import urllib.request, re, os
def getKey(url, path):
    urllib.request.urlretrieve(url, 'C:\\code\\'+path)

def readFiles(keyFile, targetFile, outfile):
    infile = keyFile # read in the input file name
    infd = open('C:\\code\\'+infile,"r") # open the file and create the file descriptor infd
    key = infd.read( )
    key = key.strip('\n')
    #print('key= '+key)
    infile = targetFile
    infd = open('C:\\code\\'+infile, "r")
    ptext = infd.read( ).strip('\n')
    infd.close
    xor(outfile, ptext, key)

def xor(outfile, ptext, key):
    outfd = open('C:\\code\\'+outfile, "w")
    pLength = len(ptext)
    #get the length of the plaintext and cut the key into the same size
    keyChunks = [key[i:i+pLength] for i in range(0, len(key), pLength)]
    i = 0
    while i < len(keyChunks):
        a = int(ptext)
        b = int(keyChunks[i])
        out = a ^ b
        i=i+1
    outfd = open('C:\\code\\'+outfile, "w")
    outfd.write(str(out))
    outfd.close

def cleanup(targetFile):
    os.remove(targetFile)

def enc():
    outfile = 'affk.xor'
    getKey('http://192.168.56.10/sym.key', 'sym.key')
    readFiles('sym.key', 'affk.txt', outfile)
    cleanup('C:/code/affk.txt')

def dec():
    outfile = 'affk.txt'
    getKey('http://192.168.56.10/sym.key', 'sym.key')
    readFiles('sym.key', 'affk.xor', outfile)
    cleanup('C:/code/affk.xor')
    cleanup('C:/code/sym.key')

enc()
#dec()
Below is a snippet of my main file with the imports and calling function
import os, math, affine, re, urllib.request, ctypes
from xor import enc
from rsa1 import run
---Additional Code here---
def main():
    # Call function to search for the files we want to encrypt
    search()
    # Call function to read the target files. Main ciphers are daisy chained off reader function
    reader(targets)
    # Call function to XOR Affine cipher key
    xor.enc()
    # Call function to encrypt key used for above XOR using RSA
    rsa1.run()
Any assistance with this problem would be greatly appreciated! Apologies if this is an extremely dumb question!
In a main Python file, I import other Python files; say their names are file1, file2, and file3, and all of them have a function inside them named scrape(). I am trying to choose which file's scrape() will run according to user input, like the following:
python main.py file1
Here is the relevant part of my code:
import file1
import file2
import file3
fileName = sys.argv[1]
for func in ['%s.scrape' % fileName]:
    meta, infos = func()
However, I get this error message:
Traceback (most recent call last):
  File "main.py", line 50, in <module>
    meta, infos = func()
TypeError: 'str' object is not callable
Note that it works when I use for func in [file1.scrape]: I just can't use user input as the imported file name. Can someone tell me how to do it?
You are trying to call func as a function, when it's really a string you built from the command-line argument.
For your purposes, as also mentioned in prashant's linked post, you might want to use something like the imp module.
Here's a quick example
import sys
import imp
# `imp.load_source` requires the full path to the module
# This will load the module provided as `user_selection`
# You can then either `import user_selection`, or use the `mod` to access the package internals directly
mod = imp.load_source("user_selection", "/<mypath>/site-packages/pytz/__init__.py")
# I'm using `user_selection` and `mod` instead of `pytz`
import user_selection
print(user_selection.all_timezones)
print(mod.all_timezones)
In your case, you might have to use imp.find_module to get the full path from just the name, or provide the full paths directly in the command line.
This should be a starting point
import sys
import imp
file_name = sys.argv[1]
f, filename, desc = imp.find_module(file_name, ['/path/where/modules/live'])
mod = imp.load_module("selected_module", f, filename, desc)
mod.scrape()
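As a side note, imp is deprecated in current Python 3 releases; if the files live on the import path (e.g. next to main.py), the same selection can be done with importlib. A sketch, assuming file1.py etc. are importable by name:

import sys
import importlib

file_name = sys.argv[1]              # e.g. "file1"
mod = importlib.import_module(file_name)
meta, infos = mod.scrape()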
I have some legacy code with a legacy function that takes a filename as an argument and processes the file contents. A working facsimile of the code is below.
What I want to do is not have to write to disk with some content that I generate in order to use this legacy function, so I thought I could use StringIO to create an object in place of the physical filename. However, this does not work, as you can see below.
I thought StringIO was the way to go with this. Can anyone tell me if there is a way to use this legacy function and pass it something in the argument that isn't a file on disk but can be treated as such by the legacy function? The legacy function does have the with context manager doing work on the filename parameter value.
The one thing I came across in google was: http://bugs.python.org/issue1286, but that didn't help me...
Code
from pprint import pprint
import StringIO
# Legacy Function
def processFile(filename):
    with open(filename, 'r') as fh:
        return fh.readlines()
# This works
print 'This is the output of FileOnDisk.txt'
pprint(processFile('c:/temp/FileOnDisk.txt'))
print
# This fails
plink_data = StringIO.StringIO('StringIO data.')
print 'This is the error.'
pprint(processFile(plink_data))
Output
This is the output in FileOnDisk.txt:
['This file is on disk.\n']
This is the error:
Traceback (most recent call last):
  File "C:\temp\test.py", line 20, in <module>
    pprint(processFile(plink_data))
  File "C:\temp\test.py", line 6, in processFile
    with open(filename, 'r') as fh:
TypeError: coercing to Unicode: need string or buffer, instance found
A StringIO instance is an open file already. The open command, on the other hand, only takes filenames, to return an open file. A StringIO instance is not suitable as a filename.
Also, you don't need to close a StringIO instance, so there is no need to use it as a context manager either. While closing an instance frees the memory allocated, so does simply letting the garbage collector reap the object. At any rate, the contextlib.closing() context manager could take care of closing the object if you want to ensure freeing the memory while still holding a reference to the object.
If all your legacy code can take is a filename, then a StringIO instance is not the way to go. Use the tempfile module to generate a temporary filename instead.
Here is an example using a contextmanager to ensure the temp file is cleaned up afterwards:
import os
import tempfile
from contextlib import contextmanager
@contextmanager
def tempinput(data):
    temp = tempfile.NamedTemporaryFile(delete=False)
    temp.write(data)
    temp.close()
    try:
        yield temp.name
    finally:
        os.unlink(temp.name)
with tempinput('Some data.\nSome more data.') as tempfilename:
    processFile(tempfilename)
You can also switch to the newer Python 3 infrastructure offered by the io module (available in Python 2 and 3), where io.BytesIO is the more robust replacement for StringIO.StringIO / cStringIO.StringIO. This object does support being used as a context manager (but still can't be passed to open()).
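For example, here is a small sketch of io.BytesIO used as a context manager; it still has to be handed to code that accepts a file object rather than a filename:

import io

with io.BytesIO(b'Some data.\nSome more data.\n') as fh:
    print(fh.readlines())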
You could define your own open function:
fopen = open
def open(fname, mode):
    if hasattr(fname, "readlines"): return fname
    else: return fopen(fname, mode)
However, with wants to call __exit__ when it's done, and StringIO does not have an __exit__ method...
You could define a custom class to use with this open:
class MyStringIO:
    def __init__(self, txt):
        self.text = txt
    def __enter__(self):
        return self
    def readlines(self):
        return self.text.splitlines()
    def __exit__(self, *args):
        pass
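A hedged usage sketch, assuming the custom open and the MyStringIO class above (with the added __enter__) are in scope, and that some_file.txt is a real file on disk:

# in-memory "file": the custom open just hands the object back
with open(MyStringIO("line one\nline two"), "r") as fh:
    print(fh.readlines())

# a real path falls through to the original built-in open
with open("some_file.txt", "r") as fh:
    print(fh.readlines())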
This one is based on the Python docs for contextmanager.
It just wraps StringIO with a simple context manager; when the with block exits, control returns past the yield point and the StringIO is properly closed. This avoids the need to make a tempfile, but with a large string it will still eat up memory, since StringIO buffers that string.
It works well in most cases where you know the string data is not going to be long.
from contextlib import contextmanager
@contextmanager
def buildStringIO(strData):
    from cStringIO import StringIO
    try:
        fi = StringIO(strData)
        yield fi
    finally:
        fi.close()
Then you can do:
with buildStringIO('foobar') as f:
    print(f.read()) # will print 'foobar'
Even though the answer above says:
"You can also switch to the newer Python 3 infrastructure offered by the io module (available in Python 2 and 3), where io.BytesIO is the more robust replacement for StringIO.StringIO / cStringIO.StringIO. This object does support being used as a context manager (but still can't be passed to open())."
in Python 3, this works for me:
from pprint import pprint
from io import StringIO
import contextlib
@contextlib.contextmanager
def as_handle(handleish, mode="r", **kwargs):
    try:
        with open(handleish, mode, **kwargs) as fp:
            yield fp
    except TypeError:
        yield handleish

def processFile(filename):
    # with filename as fh:          ### OK for StringIO
    # with open(filename) as fh:    # TypeError: expected str, bytes or os.PathLike object, not _io.StringIO
    with as_handle(filename) as fh:
        return fh.readlines()
# This used to fail, but doesn't fail anymore
plink_data = StringIO('StringIO data.')
print('This is the error.')
pprint(processFile(plink_data))
output:
This is the error.
['StringIO data.']
I have a design question. I have a function loadImage() for loading an image file. Currently it accepts a string which is a file path. But I also want to be able to load files which are not on a physical disk, e.g. generated procedurally. I could have it accept a string, but then how would it know whether the string is a file path or file data? I could add an extra boolean argument to specify that, but that doesn't sound very clean. Any ideas?
It's something like this now:
def loadImage(filepath):
    file = open(filepath, 'rb')
    data = file.read()
    # do stuff with data
The other version would be
def loadImage(data):
    # do stuff with data
How to have this function accept both 'filepath' or 'data' and guess what it is?
You can change your loadImage function to expect an opened file-like object, such as:
def load_image(f):
    data = f.read()
... and then have that called from two functions, one of which expects a path and the other a string that contains the data:
from StringIO import StringIO
def load_image_from_path(path):
    with open(path, 'rb') as f:
        load_image(f)

def load_image_from_string(s):
    sio = StringIO(s)
    try:
        load_image(sio)
    finally:
        sio.close()
How about just creating two functions, loadImageFromString and loadImageFromFile?
This being Python, you can easily distinguish between a filename and a data string. I would do something like this:
import os.path as P
from StringIO import StringIO
def load_image(im):
    fin = None
    if P.isfile(im):
        fin = open(im, 'rb')
    else:
        fin = StringIO(im)
    # Read from fin like you would from any open file object
Other ways to do it would be a try block instead of using os.path, but the essence of the approach remains the same.
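For instance, a sketch of the try-based variant; it assumes that any argument which can't be opened as a path should be treated as the raw image data itself (not the original author's code):

from io import BytesIO

def load_image(im):
    try:
        fin = open(im, 'rb')              # works when im is a valid file path
    except (IOError, OSError, ValueError, TypeError):
        fin = BytesIO(im)                 # otherwise treat im as the raw bytes
    # Read from fin like you would from any open file object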