I have 500 links to download and want to batch them into groups of, for example, 10 items.
What would the pseudocode for this look like?
class BatchJobTask(luigi.Task):
    items = luigi.Parameter()

    def run(self):
        listURLs = []
        with open('urls_chunk', 'r') as urls:
            for line in urls:
                listURLs.append('http://ggg' + line.strip() + '.org')
        ten_urls = listURLs[0:self.items]  # 10 items here
        for url in ten_urls:
            req = requests.get(url)
            req.content

    def output(self):
        return luigi.LocalTarget("downloaded_filelist.txt")

class BatchWorker(luigi.Task):
    def run(self):
        # Here I should run BatchJobTask for items 0-10, then 10-20, and so on...
How would this be done?
Here is an approach to doing something like what you want, but with the list of strings stored as separate lines in a file.
import luigi
import requests

BATCH_SIZE = 10

class BatchProcessor(luigi.Task):
    items = luigi.ListParameter()
    max = luigi.IntParameter()

    def requires(self):
        return None

    def output(self):
        return luigi.LocalTarget('processed' + str(self.max) + '.txt')

    def run(self):
        for item in self.items:
            req = requests.get('http://www.' + item + '.org')
            # do something useful with the response here
            req.content
        open('processed' + str(self.max) + '.txt', 'w').close()
class BatchCreator(luigi.Task):
    file_with_urls = luigi.Parameter()

    def requires(self):
        required_tasks = []
        batch_index = 0
        total_index = 0
        lines = []
        with open(self.file_with_urls) as f:
            for line in f:
                total_index += 1
                if batch_index < BATCH_SIZE:
                    lines.append(line)
                    batch_index += 1
                else:
                    required_tasks.append(BatchProcessor(items=lines, max=total_index))
                    lines = [line]
                    batch_index = 1
        if lines:
            # don't drop the final (possibly partial) batch
            required_tasks.append(BatchProcessor(items=lines, max=total_index))
        return required_tasks

    def output(self):
        return luigi.LocalTarget(str(self.file_with_urls) + 'processed')

    def run(self):
        open(str(self.file_with_urls) + 'processed', 'w').close()
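To kick the whole thing off from Python (a sketch; urls.txt is a hypothetical input file with one URL fragment per line), you can hand the top-level task to luigi.build:

if __name__ == '__main__':
    # Run with the local scheduler; luigi resolves the BatchProcessor
    # dependencies returned by BatchCreator.requires() and runs each batch.
    luigi.build([BatchCreator(file_with_urls='urls.txt')], local_scheduler=True)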
I did this.
class GetListtask(luigi.Task):
    def run(self):
        ...

    def output(self):
        return luigi.LocalTarget(self.outputfile)

class GetJustOneFile(luigi.Task):
    fid = luigi.IntParameter()

    def requires(self):
        pass

    def run(self):
        url = 'http://my-server.com/test' + str(self.fid) + '.txt'
        download_file = requests.get(url, stream=True)
        with self.output().open('w') as downloaded_file:
            downloaded_file.write(str(download_file.content))

    def output(self):
        return luigi.LocalTarget("test{}.txt".format(self.fid))

class GetAllFiles(luigi.WrapperTask):
    def requires(self):
        listoffiles = []  # ids 0..898
        for i in range(899):
            listoffiles.append(i)
        return [GetJustOneFile(fid=fileid) for fileid in listoffiles]
Is this code awful?
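For comparison, a variation that keeps the original batch-of-ten idea (a sketch, reusing the hypothetical my-server.com URLs from above) would give each task a slice of ids instead of a single one:

class GetFileBatch(luigi.Task):
    fids = luigi.ListParameter()

    def run(self):
        with self.output().open('w') as out:
            for fid in self.fids:
                url = 'http://my-server.com/test{}.txt'.format(fid)
                out.write(requests.get(url).text)

    def output(self):
        return luigi.LocalTarget("batch_{}_{}.txt".format(self.fids[0], self.fids[-1]))

class GetAllFilesBatched(luigi.WrapperTask):
    def requires(self):
        ids = list(range(899))
        # one task per slice of ten ids
        return [GetFileBatch(fids=ids[i:i + 10]) for i in range(0, len(ids), 10)]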
I'm trying to implement a function for reading CSV files in a class.
class input_data:
    path = ''
    start = 0
    total = 0
    datas = []
    labels = []

    def __init__(self, p):
        self.path = p

    def read_csv(self):
        print(self.path)
        print("hello")

path1 = '../generate_data/train_data/train_data.csv'
test = input_data('../generate_data/train_data/train_data.csv')
test.read_csv
The above code runs without errors, but there is no output when it is executed.
What is the reason for this?
You have to actually call the method: read_csv().
Also, don't use class attributes as instance attributes:
class InputData:
    def __init__(self, path):
        self.path = path
        self.start = 0
        self.total = 0
        self.datas = []
        self.labels = []

    def read_csv(self):
        print(self.path)
        print("hello")

path1 = '../generate_data/train_data/train_data.csv'
test = InputData(path1)
test.read_csv()
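If read_csv should eventually do real work, a minimal sketch with the standard csv module could look like this (the column layout here is an assumption):

import csv

class InputData:
    def __init__(self, path):
        self.path = path
        self.datas = []
        self.labels = []

    def read_csv(self):
        # Assumption: features in all but the last column, label in the last.
        with open(self.path) as f:
            for row in csv.reader(f):
                self.datas.append(row[:-1])
                self.labels.append(row[-1])
        self.total = len(self.datas)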
Put () at the end of the method call:
class input_data:
    path = ''
    start = 0
    total = 0
    datas = []
    labels = []

    def __init__(self, p):
        self.path = p

    def read_csv(self):
        print(self.path)
        print("hello")

path1 = '../generate_data/train_data/train_data.csv'
test = input_data(path1)
test.read_csv()
Is there a sure-fire way to check that the class of an object is a subclass of the desired superclass?
For example, in a migration script that I'm writing, I have to convert objects of a given type to dictionaries in a given manner to ensure two-way compatibility of the data.
This is best summed up like so:
Serializable
    User
    Status
    Issue
        Test
        Set
    Step
    Cycle
However, when I'm recursively checking objects after depickling, I receive a Test object that yields the following results:
Testing data object type:
type(data)
{type} <class '__main__.Test'>

Testing Class type:
type(Test())
{type} <class '__main__.Test'>

Testing object type against class type:
type(Test()) == type(data)
{bool} False

Testing if object isinstance() of Class:
isinstance(data, Test)
{bool} False

Testing if Class isinstance() of Super Class:
isinstance(Test(), Serializable)
{bool} True

Testing isinstance() of Super Class:
isinstance(data, Serializable)
{bool} False
Interestingly, it doesn't appear to have any such problem prior to pickling, as it handles the creation of the dictionary and the integrity hash just fine.
This only crops up with depickled objects, in both Pickle and Dill.
For context, here's the code in its native environment, the DataCache object that is pickled:
class DataCache(object):
    _hash = ""
    _data = None

    @staticmethod
    def genHash(data):
        dataDict = DataCache.dictify(data)
        datahash = json.dumps(dataDict, sort_keys=True)
        return hashlib.sha256(datahash).digest()

    @staticmethod
    def dictify(data):
        if isinstance(data, list):
            datahash = []
            for item in data:
                datahash.append(DataCache.dictify(item))
        elif isinstance(data, (dict, collections.OrderedDict)):
            datahash = collections.OrderedDict()
            for key, value in data.iteritems():
                datahash[key] = DataCache.dictify(value)
        elif isinstance(data, Serializable):
            datahash = data.toDict()
        else:
            datahash = data
        return datahash

    def __init__(self, restoreDict={}):
        if restoreDict:
            self.__dict__.update(restoreDict)

    def __getinitargs__(self):
        return (self.__dict__,)

    def set(self, data):
        self._hash = DataCache.genHash(data)
        self._data = data

    def verify(self):
        dataHash = DataCache.genHash(self._data)
        return (self._hash == dataHash)

    def get(self):
        return self._data
Finally, I know there are arguments for using JSON for readability in storage, but I needed Pickle's ability to convert straight to and from objects without my specifying the object type (thanks to the nesting, that's not really feasible).
Am I going mad here, or does pickling do something to the class definitions?
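For reference, pickle stores classes by reference (module name plus class name), not by value, so what you get back depends on what that name points to at load time; you can see the reference in the pickle stream itself. A small sketch using only the standard library (Thing is a hypothetical class):

import pickle
import pickletools

class Thing(object):
    pass

blob = pickle.dumps(Thing())
# The disassembly shows a GLOBAL opcode referencing "__main__ Thing",
# not the class body itself.
pickletools.dis(blob)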
EDIT:
Minimal Implementation:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests
from aenum import Enum
import json
import base64
import argparse
import os
import sys
import datetime
import dill
import hashlib
import collections

class Serializable(object):
    def __init__(self, initDict={}):
        if initDict:
            self.__dict__.update(initDict)

    def __str__(self):
        return str(self.sortSelf())

    def sortSelf(self):
        return collections.OrderedDict(sorted(self.__dict__.items()))

    def toDict(self):
        return self.__dict__

    def fromDict(self, dict):
        # Not using __dict__.update(...) to avoid polluting objects with the excess data
        varMap = self.__dict__
        if dict and varMap:
            for key in varMap:
                if (key in dict):
                    varMap[key] = dict[key]
            self.__dict__.update(varMap)
            return self
        return None
class Issue(Serializable):
    def __init__(self, initDict={}):
        self.id = 0
        self.key = ""
        self.fields = {}
        if initDict:
            self.__dict__.update(initDict)
        Serializable.__init__(self)

    def fieldToDict(self, obj, key, type):
        if key in obj:
            result = obj[key]
        else:
            return None
        if result is None:
            return None
        if isinstance(result, type):
            return result.toDict()
        return result

    def fromDict(self, jsonDict):
        super(Issue, self).fromDict(jsonDict)
        self.fields["issuetype"] = IssueType().fromDict(self.fields["issuetype"])
        self.fields["assignee"] = User().fromDict(self.fields["assignee"])
        self.fields["creator"] = User().fromDict(self.fields["creator"])
        self.fields["reporter"] = User().fromDict(self.fields["reporter"])
        return self

    def toDict(self):
        result = super(Issue, self).toDict()
        blankKeys = []
        for fieldName, fieldValue in self.fields.iteritems():
            if fieldValue is None:
                blankKeys.append(fieldName)
        if blankKeys:
            for key in blankKeys:
                self.fields.pop(key, None)
        result["fields"]["issuetype"] = self.fieldToDict(result["fields"], "issuetype", IssueType)
        result["fields"]["creator"] = self.fieldToDict(result["fields"], "creator", User)
        result["fields"]["reporter"] = self.fieldToDict(result["fields"], "reporter", User)
        result["fields"]["assignee"] = self.fieldToDict(result["fields"], "assignee", User)
        return result
class IssueType(Serializable):
    def __init__(self):
        self.id = 0
        self.name = ""

    def toDict(self):
        return {"id": str(self.id)}

class Project(Serializable):
    def __init__(self):
        Serializable.__init__(self)
        self.id = 0
        self.name = ""
        self.key = ""

class Cycle(Serializable):
    def __init__(self):
        self.id = 0
        self.name = ""
        self.totalExecutions = 0
        self.endDate = ""
        self.description = ""
        self.totalExecuted = 0
        self.started = ""
        self.versionName = ""
        self.projectKey = ""
        self.versionId = 0
        self.environment = ""
        self.totalCycleExecutions = 0
        self.build = ""
        self.ended = ""
        self.modifiedBy = ""
        self.projectId = 0
        self.startDate = ""
        self.executionSummaries = {'executionSummary': []}
class Step(Serializable):
    def __init__(self):
        self.id = ""
        self.orderId = 0
        self.step = ""
        self.data = ""
        self.result = ""
        self.attachmentsMap = {}

    def toDict(self):
        dict = {}
        dict["step"] = self.step
        dict["data"] = self.data
        dict["result"] = self.result
        dict["attachments"] = []
        return dict

class Status(Serializable):
    def __init__(self):
        self.id = 0
        self.name = ""
        self.description = ""
        self.isFinal = True
        self.color = ""
        self.isNative = True
        self.statusCount = 0
        self.statusPercent = 0.0

class User(Serializable):
    def __init__(self):
        self.displayName = ""
        self.name = ""
        self.emailAddress = ""
        self.key = ""
        self.active = False
        self.timeZone = ""
class Execution(Serializable):
    def __init__(self):
        self.id = 0
        self.orderId = 0
        self.cycleId = -1
        self.cycleName = ""
        self.issueId = 0
        self.issueKey = 0
        self.projectKey = ""
        self.comment = ""
        self.versionId = 0
        self.versionName = ""
        self.executedOn = ""
        self.creationDate = ""
        self.executedByUserName = ""
        self.assigneeUserName = ""
        self.status = {}
        self.executionStatus = ""

    def fromDict(self, jsonDict):
        super(Execution, self).fromDict(jsonDict)
        self.status = Status().fromDict(self.status)
        # This is already listed as Execution Status, need to associate and convert!
        return self

    def toDict(self):
        result = super(Execution, self).toDict()
        result['status'] = result['status'].toDict()
        return result

class ExecutionContainer(Serializable):
    def __init__(self):
        self.executions = []

    def fromDict(self, jsonDict):
        super(ExecutionContainer, self).fromDict(jsonDict)
        self.executions = []
        for executionDict in jsonDict["executions"]:
            self.executions.append(Execution().fromDict(executionDict))
        return self
class Test(Issue):
    def __init__(self, initDict={}):
        if initDict:
            self.__dict__.update(initDict)
        Issue.__init__(self)

    def toDict(self):
        result = super(Test, self).toDict()
        stepField = "CustomField_0001"
        if result["fields"][stepField]:
            steps = []
            for step in result["fields"][stepField]["steps"]:
                steps.append(step.toDict())
            result["fields"][stepField] = steps
        return result

    def fromDict(self, jsonDict):
        super(Test, self).fromDict(jsonDict)
        stepField = "CustomField_0001"
        steps = []
        if stepField in self.fields:
            for step in self.fields[stepField]["steps"]:
                steps.append(Step().fromDict(step))
            self.fields[stepField] = {"steps": steps}
        return self

class Set(Issue):
    def __init__(self, initDict={}):
        self.__dict__.update(initDict)
        Issue.__init__(self)
class DataCache(object):
    _hash = ""
    _data = None

    @staticmethod
    def genHash(data):
        dataDict = DataCache.dictify(data)
        datahash = json.dumps(dataDict, sort_keys=True)
        return hashlib.sha256(datahash).digest()

    @staticmethod
    def dictify(data):
        if isinstance(data, list):
            datahash = []
            for item in data:
                datahash.append(DataCache.dictify(item))
        elif isinstance(data, (dict, collections.OrderedDict)):
            datahash = collections.OrderedDict()
            for key, value in data.iteritems():
                datahash[key] = DataCache.dictify(value)
        elif isinstance(data, Serializable):
            datahash = data.toDict()
        else:
            datahash = data
        return datahash

    def __init__(self, restoreDict={}):
        if restoreDict:
            self.__dict__.update(restoreDict)

    def __getinitargs__(self):
        return (self.__dict__,)

    def set(self, data):
        self._hash = DataCache.genHash(data)
        self._data = data

    def verify(self):
        dataHash = DataCache.genHash(self._data)
        return (self._hash == dataHash)

    def get(self):
        return self._data
def saveCache(name, projectKey, object):
    filePath = "migration_caches/{projectKey}".format(projectKey=projectKey)
    if not os.path.exists(path=filePath):
        os.makedirs(filePath)
    cache = DataCache()
    cache.set(object)
    targetFile = open("{path}/{name}".format(name=name, path=filePath), 'wb')
    dill.dump(obj=cache, file=targetFile)
    targetFile.close()

def loadCache(name, projectKey):
    filePath = "migration_caches/{projectKey}/{name}".format(name=name, projectKey=projectKey)
    result = False
    try:
        targetFile = open(filePath, 'rb')
        try:
            cache = dill.load(targetFile)
            if isinstance(cache, DataCache):
                if cache.verify():
                    result = cache.get()
        except EOFError:
            # except BaseException:
            print("Failed to load cache from file: {filePath}\n".format(filePath=filePath))
    except IOError:
        print("Failed to load cache file at: {filePath}\n".format(filePath=filePath))
    targetFile.close()
    return result
testIssue = Test().fromDict({"id": 1000,
                             "key": "TEST",
                             "fields": {
                                 "issuetype": {
                                     "id": 1,
                                     "name": "TestIssue"
                                 },
                                 "assignee": "Minothor",
                                 "reporter": "Minothor",
                                 "creator": "Minothor",
                             }
                             })

saveCache("Test", "TestProj", testIssue)
result = loadCache("Test", "TestProj")
EDIT 2
The script in its current form now seems to work correctly with vanilla Pickle (I initially switched to Dill due to a similar issue, which was solved by the switch).
However, if you are here with this issue and require Dill's features, then, as Mike noted in the comments, it's possible to change the settings in dill.settings so that Dill pickles classes by reference only, effectively mirroring pickle's standard pickling behaviour.
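If I read that suggestion correctly, the relevant switch is dill's byref setting (a sketch; check dill's documentation for the exact semantics in your version):

import dill

# Assumption: with byref=True dill pickles classes by reference,
# like pickle does, instead of serializing the class definition by value.
dill.settings['byref'] = True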
I have created a class that keeps a record of all of its instances, and I need to parallelize the instantiation process, but I cannot solve the problem of sharing the class itself (as a class object) across processes. Is this possible in Python 2.7 using multiprocessing?
OUTPUT_HEADINGS = []

class MyContainer(object):
    """
    """
    instances = []
    children = []

    @classmethod
    def export_to_csv(cls):
        with open(args.output, "w") as output_file:
            f_csv = csv.DictWriter(output_file, fieldnames=OUTPUT_HEADINGS)
            f_csv.writeheader()
            for instance in cls.instances:
                f_csv.writerow(instance.to_dict())

    def __new__(cls, dat_file):
        try:
            tree = ElementTree.parse(dat_file)
            cls.children = tree.findall("parent_element/child_element")
        except ElementTree.ParseError as err:
            logging.exception(err)
        if not cls.children:
            msg = ("{}: No \"parent_element/child_element\""
                   " element found".format(os.path.basename(dat_file)))
            logging.warning(msg)
            cls.children = []
            return False
        else:
            instance = super(MyContainer, cls).__new__(cls, dat_file)
            instance.__init__(dat_file)
            cls.instances.append(instance)
            cls.children = []
            return True

    def __init__(self, dat_file):
        self._name = os.path.basename(dat_file)
        self.attr_value_sum = defaultdict(list)
        var1 = MyContainer.children[0].find("var1")
        var2 = MyContainer.children[0].get("var2")
        cat_name = "{}.{}".format(var1, var2)
        if cat_name not in OUTPUT_HEADINGS:
            OUTPUT_HEADINGS.append(cat_name)
        # processing and summarizing of xml data

    def to_dict(self):
        return output_dict
def main():
    i = 0
    try:
        for f in FILE_LIST:
            i += 1
            print "{}/{}: {} in progress...".format(i, len(FILE_LIST), f)
            print "{}".format("...DONE" if MyContainer(f) else "...SKIPPED")
    except Exception as err:
        logging.exception(err)
    finally:
        MyContainer.export_to_csv()

if __name__ == '__main__':
    FILE_LIST = []
    for d in args.dirs:
        FILE_LIST.extend(get_name_defined_files(dir_path=d,
                                                pattern=args.filename,
                                                recursive=args.recursive))
    main()
I tried to use multiprocessing.managers.BaseManager to create a proxy for the MyContainer class, but that way I can only create an instance object. What I actually want is to parallelize the MyContainer(dat_file) call itself.
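As a rough sketch of one alternative (assuming the expensive part is the XML parsing, and that MyContainer could be refactored to accept pre-parsed data), a multiprocessing.Pool can do the parsing in worker processes while every instance is created in the parent, so MyContainer.instances lives in a single process:

import multiprocessing
from xml.etree import ElementTree

def parse_children(dat_file):
    """Worker: do the heavy parsing, return only picklable data."""
    try:
        tree = ElementTree.parse(dat_file)
        children = tree.findall("parent_element/child_element")
        return dat_file, [ElementTree.tostring(c) for c in children]
    except ElementTree.ParseError:
        return dat_file, None

if __name__ == '__main__':
    pool = multiprocessing.Pool()
    for dat_file, children in pool.imap_unordered(parse_children, FILE_LIST):
        if children:
            MyContainer(dat_file)  # hypothetical refactor: pass children in directly
    pool.close()
    pool.join()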
I'm attempting to create a source client for icecast2/shoutcast, but after compiling everything I ran into a segmentation fault. After further debugging with gdb I got a more detailed error.
I do not know C of any kind, so I'm not sure what to make of this error:
Program received signal SIGSEGV, Segmentation fault.
send_mp3 (self=0x988eb0, buff=0xa5c154 "") at mp3.c:175
175     mp3.c: No such file or directory.
I thought maybe the loop was using too much resources, but no matter how high I set time.sleep() I still got the same result.
import random
import shout
from pyModules import db
from pyModules import error
import ID3
import time
import sys
import glob
class Audio(object):
    def __init__(self):
        self.count = 0
        self.nbuf = 0
        self.buf = 0
        self.shout = shout.Shout()
        self.db = db.database()
        self.songs = self.load()

    def load(self):
        return glob.glob("%s*%s" % (self.config('directory'), self.config('ext')))

    def start(self):
        self.running = True
        self.shout.host = self.config('host')
        self.shout.port = self.config('port')
        self.shout.mount = self.config('mount')
        self.shout.protocol = self.config('protocol')
        self.shout.user = self.config('user')
        self.shout.password = self.config('password')
        self.shout.name = self.config('name')
        self.shout.format = self.config('format')
        self.shout.genre = self.config('genre')
        self.shout.url = self.config('url')
        self.shout.public = self.config('public')
        self.shout.description = self.config('description')
        self.songs = self.load()
        self.shout.open()

    def cShuffle(self):
        sh = self.getSettings(1, key='shuffle')
        if sh == 1:
            random.shuffle(self.songs)

    def cNext(self):
        n = self.getSettings(1, key='setSong')
        if n == 1:
            self.stop()
            self.db.setNext(0)
            self.Change()

    def cPrev(self):
        p = self.getSettings(1, key='prevSong')
        if p == 1:
            self.stop()
            if self.count == 0:
                self.count -= 1
                self.db.setPrev(0)
                self.Change()
            else:
                self.count -= 2
                self.Change()

    def cReload(self):
        r = self.getSettings(1, key='reload')
        if r == 1:
            self.songs = self.load()

    def check(self):
        self.cShuffle()
        self.cNext()
        self.cPrev()
        self.cReload()

    def getSettings(self, mode=0, key=None):
        return self.db.getSettings(mode, key)
    def config(self, value):
        return self.db.config(value)

    def getTitle(self, File, mode=0):
        try:
            song = ID3.ID3(File)
            title = song["TITLE"]
        except:
            title = "unknown"
        title = title.replace("'", "")
        if mode == 0:
            self.db.setSongTitle(title)
            return title
        elif mode == 1:
            self.db.setNextSongTitle(title)
            return title
        elif mode == 2:
            self.db.setPrevSongTitle(title)

    def sendBlankFile(self):
        File = open('/home/radio/AudioServer/bin/blank.mp3').read()
        self.shout.send(File)

    def stop(self):
        self.buf = 0
        self.nbuf = 0
        self.running = 0
        self.sendBlankFile()

    def Change(self):
        self.stop()
        if len(self.songs) >= self.count: self.count = 0
        else: self.count += 1
        song = self.songs[self.count]
        psong = self.songs[self.count - 1]
        nsong = self.songs[self.count + 1]
        self.getTitle(song, mode=0)
        self.getTitle(nsong, mode=1)
        self.getTitle(psong, mode=2)
        self.play()
    def play(self):
        song = open(self.songs[self.count])
        cs = self.songs[self.count]
        self.shout.set_metadata({'song': self.getTitle(cs)})
        total = 0
        st = time.time()
        self.nbuf = song.read(4096)
        while self.running:
            self.check()
            # read ahead one buffer so the end of the file is detected early
            self.buf = self.nbuf
            self.nbuf = song.read(4096)
            total = total + len(self.buf)
            if len(self.buf) == 0:
                self.running = False
                self.Change()
            self.shout.send(self.buf)
            self.shout.sync()

if __name__ == "__main__":
    Server = Audio()
    default = Server.config('default')
    Server.db.clear(default)
    Server.start()
The issue was indeed a compile problem with libshout, as cox pointed out. But it only worked on Debian 7 and not on Ubuntu 12. I think the reason is that I did not install libogg on Ubuntu; I only installed vorbis, which I thought was the same thing. I also installed mp3 codecs just in case.
I have a program that uses a number of global variables and I was hoping to write a few unit tests for some of the methods in the program.
I was new to python when I started writing the code, and realise now that I should have been testing all along. Some of the methods in the program are as follows:
class Wordnet():
    def __init__(self):
        self.graph = Graph()
        self.before_at = ''
        self.after_at = ''
        self.word_part = ''
        self.gloss_part = ''
        self.lex_filenum = ''

    def process_file(self):
        self.file = open("testing_line.txt", "r")
        return self.file

    def line_for_loop(self, file):
        for line in file:
            self.split_pointer_part(line)
            self.split_word_part(line)
            self.split_gloss_part(line)
            self.process_lex_filenum(self.word_part)

    def split_pointer_part(self, line):
        self.before_at, self.after_at = line.split('#', 1)
        return self.before_at, self.after_at

    def split_word_part(self, line):
        self.word_part = line.split()
        return self.word_part

    def split_gloss_part(self, line):
        self.gloss_part = line.strip().split('|')
        return self.gloss_part

    def process_lex_filenum(self, word_part):
        self.lex_filenum = word_part[1]
        return self.lex_filenum

if __name__ == '__main__':
    wordnet = Wordnet()
    my_file = wordnet.process_file()
    wordnet.line_for_loop(my_file)
What is confusing me is how the variables pass through to the test class, and how I should go about writing the testing methods. So far this is what I have:
class WordnetTestCase(unittest.TestCase):
    def setUp(self):
        self.wn = wordnet.Wordnet()
        self.graph = wordnet.Graph()
        self.before_at = wordnet.before_at
        self.after_at = wordnet.after_at
        self.word_part = wordnet.word_part
        self.gloss_part = wordnet.gloss_part
        self.lex_filenum = wordnet.lex_filenum

    def test_split_pointer_part(line):
        expected = '13797906 23 n 04 flood 0 inundation 0 deluge 0 torrent 0 005', ' 13796604 n 0000 + 00603894 a 0401 + 00753137 v 0302 + 01527311 v 0203 + 02361703 v 0101 | an overwhelming number or amount; "a flood of requests"; "a torrent of abuse"'
        real = self.wn.split_pointer_part()
        self.assertEqual(real, expected)

if __name__ == '__main__':
    unittest.main()
    raw_input("Press <ENTER> to exit")
This does not work at the moment, and I know I am not doing it the right way, but I just can't find any specific help for this problem!
Here is a runnable example to get you started:
import unittest

class Wordnet():
    def __init__(self):
        # self.graph = Graph()
        self.before_at = ''
        self.after_at = ''
        self.word_part = ''
        self.gloss_part = ''
        self.lex_filenum = ''

    def process_file(self):
        self.file = open("testing_line.txt", "r")
        return self.file

    def line_for_loop(self, file):
        for line in file:
            self.split_pointer_part(line)
            self.split_word_part(line)
            self.split_gloss_part(line)
            self.process_lex_filenum(self.word_part)

    def split_pointer_part(self, line):
        self.before_at, self.after_at = line.split('#', 1)
        return self.before_at, self.after_at

    def split_word_part(self, line):
        self.word_part = line.split()
        return self.word_part

    def split_gloss_part(self, line):
        self.gloss_part = line.strip().split('|')
        return self.gloss_part

    def process_lex_filenum(self, word_part):
        self.lex_filenum = word_part[1]
        return self.lex_filenum

class WordnetTestCase(unittest.TestCase):
    def setUp(self):
        self.wn = Wordnet()

    def test_split_pointer_part(self):
        line = 'foo#bar'
        result = self.wn.split_pointer_part(line)
        answer = ('foo', 'bar')
        self.assertEqual(len(result), 2)
        for r, a in zip(result, answer):
            self.assertEqual(r, a)

if __name__ == '__main__':
    unittest.main()
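The other methods can be tested the same way; for instance, a sketch for split_gloss_part following the same pattern:

class WordnetGlossTestCase(unittest.TestCase):
    def setUp(self):
        self.wn = Wordnet()

    def test_split_gloss_part(self):
        line = 'word data | a gloss with examples\n'
        result = self.wn.split_gloss_part(line)
        # strip() removes the trailing newline before splitting on '|'
        self.assertEqual(result, ['word data ', ' a gloss with examples'])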