I am trying to create a file using luigi that will take a CSV and split the data into different dataframes based on whether or not a particular column in the CSV contains a particular string.
This is the file I created:
import luigi
import pandas as pd
import os

class read_file(luigi.Task):
    fileName = luigi.Parameter()

    def run(self):
        full_file = pd.read_csv(self.fileName)
        return full_file[['anonymous_id', 'channel', 'context_campaign_content',
                          'context_campaign_medium', 'context_campaign_name',
                          'context_campaign_source', 'context_campaign_term',
                          'timestamp', 'user_id', 'context_page_url',
                          'properties_url', 'properties_search', 'context_page_title',
                          'properties_path', 'context_user_agent', 'properties_referrer', 'rank']]

    def output(self):
        return full_file

class blog_readers(luigi.Task):
    def run(self):
        read_blog = read_file.full_file[read_file.full_file['properties_url'].str.contains('blog', regex=False)]
        return read_blog

    def requires(self):
        return read_file

    def output(self):
        return read_blog

class logged_in(luigi.Task):
    def run(self):
        logged_in = read_file.full_file[read_file.full_file['properties_url'].str.contains('login',regex=False]
        return logged_in

    def requires(self):
        return read_file

    def output(self):
        return logged_in

if __name__ == '__main__':
    luigi.run()
However, when I run this file in the terminal:
python cleanup.py --local-scheduler read_blog --fileName '/Users/**/Desktop/file.csv'
I encounter this error message:
File "cleancopy.py", line 64
logged_in = read_file.full_file[read_file.full_file['properties_url'].str.contains('login',regex=False]
^
SyntaxError: invalid syntax
I am not sure what is causing the syntax error.
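For what it's worth, the quoted line is missing the closing parenthesis of the str.contains(...) call before the final square bracket; balanced, it would read:

logged_in = read_file.full_file[read_file.full_file['properties_url'].str.contains('login', regex=False)]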
I need the file name with its extension.
This is my Handler:
class Handler(FileSystemEventHandler):
    ...
    def on_modified(self, event):
        if not event.is_directory:
            origin_file_path = event.src_path
            file_name = os.path.basename(event.src_path)
            print(file_name)
        return super().on_modified(event)
    ...
The result looks like this:
/usr/myproject/test.py
but suddenly the result changed to this:
/usr/myproject/.test.py.wcP5y9
How can I solve this problem?
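A guess, since nothing in the handler itself creates that name: a suffix like .wcP5y9 usually comes from a program saving atomically, i.e. writing a temporary file and renaming it over the original, so on_modified also fires for the temporary path. One way to sidestep those events is to filter on the extension you expect; a minimal sketch (the .py filter is an assumption):

import os
from watchdog.events import FileSystemEventHandler

class Handler(FileSystemEventHandler):
    def on_modified(self, event):
        if not event.is_directory:
            file_name = os.path.basename(event.src_path)
            # Skip temporary names such as '.test.py.wcP5y9';
            # only react to the extension we actually expect.
            if file_name.endswith('.py'):
                print(file_name)
        return super().on_modified(event)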
I am using the python unittest module for testing a file that takes a command line argument. The argument is a file name which is then passed into a function like so:
file_name = str(sys.argv[1])
file = open(file_name)
result = main_loop(file)
print(result)
My test is set up like so:
class testMainFile(unittest.TestCase):
    def test_main_loop(self):
        file = open('file_name.json')
        result = main_file.main_loop(file)
        self.assertEqual(result, 'Expected Result')

if __name__ == '__main__':
    unittest.main()
When I run the test I get an "IndexError: list index out of range".
I tried passing the argument when running the test but to no avail. How do I run my test without error?
I think you have a couple of options here. First, go to the documentation and check out patch, because I think you can get away with:
from unittest.mock import patch

@patch('sys.argv', ['mock.py', 'test-value'])
def test_main_loop(self):
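Fleshed out into a complete test, that could look like the sketch below (main_file, main_loop, and file_name.json are the names from the question; note that patch only helps if sys.argv is read when main_loop runs, not at module import time):

import unittest
from unittest.mock import patch

import main_file  # the module under test, as named in the question

class testMainFile(unittest.TestCase):
    # patch() swaps sys.argv in for the duration of the test
    # and restores the original value afterwards, even on failure.
    @patch('sys.argv', ['mock.py', 'file_name.json'])
    def test_main_loop(self):
        file = open('file_name.json')
        result = main_file.main_loop(file)
        self.assertEqual(result, 'Expected Result')

if __name__ == '__main__':
    unittest.main()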
Options for fun:
One would be simply to override sys.argv next to your call:
def test_main_loop(self):
    file = open('file_name.json')
+   original_argv = sys.argv
+   sys.argv = ['mock argv', 'my-test-value']
    result = main_file.main_loop(file)
+   sys.argv = original_argv
    self.assertEqual(result, 'Expected Result')
Second would be to create a simple wrapper for your function:
def set_sys_argv(func: Callable):
    sys.argv = ['mock.py', 'my_test_value']
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)
    return wrapper
and use it with the test function:
@set_sys_argv
def test_main_loop(self):
We can improve it slightly and make it more generic with a decorator that accepts the values to mock:
def set_sys_argv(*argv):
    sys.argv = list(argv)
    def _decorator(func: Callable):
        def wrapper(*args, **kwargs):
            return func(*args, **kwargs)
        return wrapper
    return _decorator
and use it similarly to patch:
@set_sys_argv('mock.py', 'test-value')
def test_main_loop(self):
Third would be to create a context manager, likewise:
class ReplaceSysArgv(list):
    def __enter__(self):
        self._argv = sys.argv
        sys.argv = ['mock', 'my-test-value']
        return self

    def __exit__(self, *args):
        sys.argv = self._argv
and use it with your code:
def test_main_loop(self):
    file = open('file_name.json')
    with ReplaceSysArgv():
        result = main_file.main_loop(file)
    self.assertEqual(result, 'Expected Result')
You have to push the arguments onto sys.argv before retrieving them (if your code is pulling from command-line arguments; it's unclear to me where in your test you're using them, but I digress).
So start with something like:
import sys
sys.argv = ['mock_filename.py', 'json_file.json']
#... continue with rest of program / test.
I'll try to explain my issue.
I want to run different processes at the same time. So, my goal is:
Download multiple files at the same time
When a zip is downloaded, apply processing to all the files it contains, using multiprocessing
The problem is that as soon as one file is being processed, all the other processes are killed.
Here's part of my code:
I.e. I create some processes with main.py and reidentify.py
MAIN.PY
class NoDaemonProcess(multiprocessing.Process):
    @property
    def daemon(self):
        return False

    @daemon.setter
    def daemon(self, value):
        pass

class NoDaemonContext(type(multiprocessing.get_context())):
    Process = NoDaemonProcess

class MyPool(multiprocessing.pool.Pool):
    def __init__(self, *args, **kwargs):
        kwargs['context'] = NoDaemonContext()
        super(MyPool, self).__init__(*args, **kwargs)
def parse_args():
    """Parse arguments"""
    # Some code

def execute(args, row):
    """Execute function"""
    try:
        upload_file = #some code
        # Download study
        download_ = download.Download(
            upload_file,
            # Some code
        )
        # Re-identification
        reidentify_ = reidentify.Reidentify(
            upload_file,
        )
        reidentify_.check_if_dicom_is_reidentify()
        download_.download_study()
        reidentify_.reidentify()
    except Exception as error:
        raise

def main():
    """Main function"""
    try:
        args = parse_args()
        csv_file = pd.read_csv(args.fpath_csv)
        msg = 'Some required CSV columns are missing'
        assert set(csv_file.columns) >= required_columns, msg
        # Create a multiprocessing Pool
        with MyPool() as pool:
            func = partial(execute, args)
            pool.map(func, zip(csv_file.iterrows()))
        LOGGER.info("Completed")
    except Exception as error:
        LOGGER.error(error)
        sys.exit(1)
REIDENTIFY.PY
class Reidentify:
    """Re-identify class"""
    def processing_data(self, archive_, file):
        """Processing data"""
        # Edit Dicom general fields

    def reidentify(self):
        """Re-identify dicom and save to filepath_output"""
        try:
            # Re-identification of the Dicom
            archive_ = archive.Archive(self.download.filepath_download)
            with multiprocessing.Pool() as pool:
                func = partial(self.processing_data, archive_)
                pool.map(func, archive_.list_archive_files())
            LOGGER.info(f"{self.filepath_output} is re-identified")
        except DicomAlreadyReidentified as error:
            LOGGER.info(error)
            return
        except Exception:
            raise
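For context on the NoDaemonProcess/MyPool boilerplate above: workers in a regular multiprocessing.Pool are daemonic, and daemonic processes are not allowed to have children, so the nested Pool inside reidentify() would otherwise fail. A minimal sketch of that failure mode (standard library only):

import multiprocessing

def inner(x):
    return x * 2

def outer(n):
    # A regular Pool marks its workers as daemonic, so creating this
    # nested Pool raises "daemonic processes are not allowed to have children".
    with multiprocessing.Pool(2) as pool:
        return sum(pool.map(inner, range(n)))

if __name__ == '__main__':
    with multiprocessing.Pool(2) as pool:
        print(pool.map(outer, [3, 4]))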
I am having an issue closing Excel after using Dispatch.
import openpyxl as xl
import os
from win32com import client

class CTAutomation:
    def __init__(self, file):
        self.invoice = xl.load_workbook(os.getcwd() + "\Templates\ctrates.xlsx")
        self.xlTemplate = xl.load_workbook(os.getcwd() + "\Templates\invoiceTemplate.xlsx")
        self.vpc = xl.load_workbook(os.getcwd() + "\Templates\Vpc.xlsx")
        self.file = file

    def invoice_make(self):
        self.xlApp = client.Dispatch("Excel.Application")
        self.xlbook = self.xlApp.Workbooks.Open(os.getcwd() + '\TestFiles\\' + self.file)
        self.ws = self.xlbook.Worksheets[0]
        self.ws.Visible = 1
        self.ws.ExportAsFixedFormat(0, os.getcwd() + "\complitedpdf\\" + self.file + ".pdf")
        self.quit()

    def quit(self):
        self.xlbook.Close()
        self.xlApp.Quit()

    def xlformater(self):
        return None

def main():
    pwd = os.listdir(os.getcwd() + "\TestFiles")
    for file in pwd:
        CTAutomation(file.strip(".xlsx")).invoice_make()

if __name__ == "__main__":
    main()
All works well till this part. I have found a few posts about this topic in the forum, but I feel that I'm still missing something to close the app,
for example: .xlsx and xls (Latest Versions) to pdf using python
Some advice would be much appreciated.
Essentially it is your class object persisting in memory. Consider wrapping the process in a context manager using with() and calling invoice_make() within the context.
Additionally, you had an incorrect Excel method: indexing the worksheet by zero with square brackets.
Finally, consider using os.path.join() to avoid back or forward slashes, and use a try/except block to catch COM exceptions and properly release objects from memory.
import openpyxl as xl
import os
from win32com import client

cwd = os.getcwd()

class CTAutomation:
    def __init__(self):
        self.invoice = xl.load_workbook(os.path.join(cwd, "Templates", "ctrates.xlsx"))
        self.xlTemplate = xl.load_workbook(os.path.join(cwd, "Templates", "invoiceTemplate.xlsx"))
        self.vpc = xl.load_workbook(os.path.join(cwd, "Templates", "Vpc.xlsx"))

    def invoice_make(self, file):
        try:
            self.xlApp = client.Dispatch("Excel.Application")
            self.xlbook = self.xlApp.Workbooks.Open(os.path.join(cwd, "TestFiles", file))
            self.ws = self.xlbook.Worksheets(1)    # USE PARENTHESES (NOT BRACKETS) AND NON-ZERO INDEX
            # self.ws.Visible = 1                  # KEEP PROCESS IN BACKGROUND
            self.ws.ExportAsFixedFormat(0, os.path.join(cwd, "complitedpdf", file.replace(".xlsx", ".pdf")))
            self.xlbook.Close(False)
            self.xlApp.Quit()
        except Exception as e:
            print(e)
        finally:
            self.ws = None                         # RELEASE EXCEL OBJS FROM MEMORY
            self.xlbook = None
            self.xlApp = None

    def xlformater(self):
        return None

    def __enter__(self):
        return self                                # BOUND TO as IN with()

    def __exit__(self, *err):
        return None

def main():
    pwd = os.listdir(os.path.join(cwd, "TestFiles"))
    with CTAutomation() as obj:                    # CONTEXT MANAGER
        for file in pwd:
            print(file)
            obj.invoice_make(file)

if __name__ == "__main__":
    main()
I am trying to implement an FTP server using twisted that limits the size of the uploaded file. Ideally this would happen before the transfer starts, but it is not really a problem if it exits gracefully during the transfer if it is too large.
I have started from the very basic ftpserver.py and slowly been pulling in more of the underlying classes from ftp.py to get down to the innards.
Current code below, please excuse the 'hack-and-slash' style employed until I can get it working.
#!/usr/bin/python
import os

from twisted.protocols.ftp import FTPFactory, FTPShell, FTPAnonymousShell, IFTPShell
from twisted.cred.portal import Portal
from twisted.cred.checkers import AllowAnonymousAccess
from twisted.internet import reactor, defer
from twisted.python import filepath, failure

class FileConsumer1(object):
    def __init__(self, fObj):
        self.fObj = fObj

    def registerProducer(self, producer, streaming):
        self.producer = producer
        assert streaming

    def unregisterProducer(self):
        self.producer = None
        self.fObj.close()

    def write(self, bytes):
        size = os.fstat(self.fObj.fileno()).st_size + len(bytes)
        if size > 10:
            raise Exception("File too large") # WHAT GOES HERE?
        self.fObj.write(bytes)

class FileWriter1(object):
    def __init__(self, fObj):
        self.fObj = fObj
        self._receive = False

    def receive(self):
        assert not self._receive, "Can only call IWriteFile.receive *once* per instance"
        self._receive = True
        return defer.succeed(FileConsumer1(self.fObj))

    def close(self):
        return defer.succeed(None)

class FTPShell1(FTPShell):
    def openForWriting(self, path):
        p = self._path(path)
        if p.isdir():
            return defer.fail(IsADirectoryError(path))
        try:
            fObj = p.open('w')
        except (IOError, OSError), e:
            return errnoToFailure(e.errno, path)
        except:
            return defer.fail()
        return defer.succeed(FileWriter1(fObj))

class FTPRealm1(object):
    def __init__(self, root):
        self.path = filepath.FilePath(root)

    def requestAvatar(self, avatarId, mind, *interfaces):
        avatar = FTPShell1(self.path)
        return (IFTPShell, avatar, getattr(avatar, 'logout', lambda: None))

p = Portal(FTPRealm1('./'), [AllowAnonymousAccess()])
f = FTPFactory(p)
reactor.listenTCP(4021, f)
reactor.run()
Clearly the check if size > 10 will be bigger, but how should I be indicating there's a problem at this point? As it stands, twisted catches that exception, but it's not very elegant. As far as I can see from examination of ftp.py, there's nothing obvious I can return here. Can I pass down a deferred in some way? How should I be closing down the transfer elegantly?
Thanks,
Here's a revised version:
#!/usr/bin/python
import os

from zope.interface import Interface, implements
from twisted.protocols.ftp import FTPFactory, FTPShell, FTPAnonymousShell, IFTPShell, IWriteFile, BaseFTPRealm, FTPCmdError, EXCEEDED_STORAGE_ALLOC
from twisted.cred.portal import Portal
from twisted.cred.checkers import AllowAnonymousAccess
from twisted.internet import reactor, defer, interfaces
from twisted.python import filepath

class ExceededStorageAllocError(FTPCmdError):
    errorCode = EXCEEDED_STORAGE_ALLOC

class FileConsumer(object):
    implements(interfaces.IConsumer)

    def __init__(self):
        self.data = ""
        self.error = None

    def registerProducer(self, producer, streaming):
        self.producer = producer
        assert streaming

    def unregisterProducer(self):
        if self.producer:
            self.producer.stopProducing()
        self.producer = None

    def write(self, bytes):
        self.data += bytes
        if len(self.data) > 10:
            self.unregisterProducer()
            self.error = ExceededStorageAllocError()

class FileWriter(object):
    implements(IWriteFile)

    def __init__(self, path):
        self.path = path

    def receive(self):
        self.consumer = FileConsumer()
        return defer.succeed(self.consumer)

    def close(self):
        if self.consumer.error:
            return defer.fail(self.consumer.error)
        try:
            f = self.path.open('w')
        except (IOError, OSError), e:
            return errnoToFailure(e.errno, path)
        f.write(self.consumer.data)
        return defer.succeed(None)

class FTPShell1(FTPShell):
    makeDirectory = FTPAnonymousShell.makeDirectory
    removeDirectory = FTPAnonymousShell.removeDirectory

    def openForWriting(self, path):
        p = self._path(path)
        if p.isdir():
            return defer.fail(IsADirectoryError(path))
        return defer.succeed(FileWriter(p))

class FTPRealm1(BaseFTPRealm):
    def __init__(self, root):
        self.root = root

    def requestAvatar(self, avatarId, mind, *interfaces):
        avatar = FTPShell1(filepath.FilePath(self.root))
        return (IFTPShell, avatar, getattr(avatar, 'logout', lambda: None))

p = Portal(FTPRealm1('./'), [AllowAnonymousAccess()])
f = FTPFactory(p)
reactor.listenTCP(4021, f)
reactor.run()
This accumulates the received data within the FileConsumer(), then aborts if the file is too long. The close() method of the FileWriter() then either reports that error or writes the complete buffer to the file.
The only real issue I'm having with this is that when run, the exception is displayed on the server:
Unexpected error received during transfer:
Traceback (most recent call last):
Failure: __main__.ExceededStorageAllocError:
As a quick disclaimer, I'm very bad with Twisted's producer/consumer model, so this may not work. As always, I'm not responsible if things blow up ;)
You seem to be on the correct path so pat yourself on the back for that. I think if you call unregisterProducer when a file is too large, the file should stop consuming. You may also need to call self.producer.stopProducing(), but don't quote me on that.
def unregisterProducer(self):
    self.producer.stopProducing()
    self.fObj.close()

def write(self, bytes):
    size = os.fstat(self.fObj.fileno()).st_size + len(bytes)
    if size > 10:
        self.unregisterProducer()
        # log statements would go here
        # do some clean up too
        return
    self.fObj.write(bytes)
If my mental Python interpreter is correct, this should simply stop consuming the file. As far as what you should return to the client, you're going to have to read the FTP RFC to figure that out.
PS
As tedious as it may seem, please use the @implementer decorator. Most times you'll be fine, but there may be instances where unexpected tracebacks appear.
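For instance, converting the FileConsumer from the revised version to the decorator form would look something like this:

from zope.interface import implementer
from twisted.internet import interfaces

# The class decorator replaces the implements(...) call in the class body.
@implementer(interfaces.IConsumer)
class FileConsumer(object):
    def __init__(self):
        self.data = ""
        self.error = None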