I have problems with Scrapy running multiple crawlers. What I want to achieve:
I have an engine running in the background taking tasks/jobs from a MySQL database. Every 15 seconds the MySQL DB is queried, and if there is a new job, Scrapy should process it.
My setup is working fine so far. The last problem is that my spiders (Scrapy spiders) get "stacked" in the CrawlerRunner.
Start:
def schedule():
    jobs = GetJob.Job()
    jobs.getJobs()


if __name__ == "__main__":
    t = task.LoopingCall(schedule)
    t.start(15)
    reactor.run()
After GetJob.Job() the jobs are processed here:
class ProcessJob():
    def processJob(self, job):
        # update job
        mysql = MysqlConnector.Mysql()
        db = mysql.getConnection()
        cur = db.cursor()
        job.status = 1
        update = "UPDATE job SET status=1 WHERE id=" + str(job.id)
        cur.execute(update)
        db.commit()
        db.close()

        # Start new crawler
        webspider = MySpider.MySpider(job)

        # Some settings
        ajaxSettings = CrawlerSettings.ajax_settings
        normalSettings = CrawlerSettings.normal_settings
        configure_logging()

        if job.max_pages != 0:
            ajaxSettings["CLOSESPIDER_PAGECOUNT"] = 0
            ajaxSettings["CLOSESPIDER_ITEMCOUNT"] = job.max_pages
            normalSettings["CLOSESPIDER_PAGECOUNT"] = 0
            normalSettings["CLOSESPIDER_ITEMCOUNT"] = job.max_pages

        # max connections
        concurrent_requests = int(job.max_pages / 20)
        if concurrent_requests < 1:
            concurrent_requests = 10
        if concurrent_requests > 500:
            concurrent_requests = 500
        ajaxSettings["CONCURRENT_REQUESTS"] = concurrent_requests
        normalSettings["CONCURRENT_REQUESTS"] = concurrent_requests

        # Ajax true or false
        if job.ajax == 1:
            runner = CrawlerRunner(ajaxSettings)
        else:
            runner = CrawlerRunner(normalSettings)

        d = runner.crawl(webspider, job=job)
And here is my spider:
class MySpider(CrawlSpider):

    def __init__(self, job):
        # Get the hosts
        self.job = job
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        allowedDomainsPre = job.url.split(",")
        allowedDomains = []

        for domains in allowedDomainsPre:
            parsed_uri = urlparse(domains)
            domain = '{uri.netloc}'.format(uri=parsed_uri)
            print domain
            allowedDomains.append(domain)

        self.allowed_domains = allowedDomains
        self.start_urls = allowedDomainsPre

        # Get job patterns
        jobPatterns = job.processing_patterns.split(",")
        allowedPatterns = []
        deniedPatterns = []
        for pattern in jobPatterns:
            if '-' in pattern:
                deniedPatterns.append(pattern.replace("-", ""))
            else:
                allowedPatterns.append(pattern)

        self._rules = [
            Rule(LinkExtractor(allow=(allowedPatterns), deny=(deniedPatterns)), callback=self.parse_items, follow=True)
        ]
        self.name = job.id
        self.settings = CrawlerSettings.normal_settings

    def spider_closed(self, spider):
        stats = spider.crawler.stats.get_stats()
        itemCount = 0
        try:
            itemCount = stats["item_scraped_count"]
        except:
            print "Item count = zero"
        DoneJob.DoneJob().jobDone(self.job, itemCount)

    def parse_items(self, response):
        item = Item()
        # if the user wants a minimum description
        if self.job.min_description > 0:
            item['html'] = response.body
            item['url'] = response.url
            item['job_id'] = self.job.id
            soup = BeautifulSoup(response.body, 'html.parser')
            article = Document(soup.prettify()).summary()
            article_soup = BeautifulSoup(article)
            text = re.sub(' +', ' ', article_soup.get_text().rstrip())
            text_length = len(text.split(' '))
            if text_length > self.job.min_description:
                return item
        else:
            item['html'] = response.body
            item['url'] = response.url
            item['job'] = {}
            # Job
            item['job']['id'] = self.job.id
            item['job']['user_id'] = self.job.user_id
            item['job']['name'] = self.job.name
            item['job']['url'] = self.job.url
            item['job']['api'] = self.job.api
            item['job']['max_pages'] = self.job.max_pages
            item['job']['crawl_depth'] = self.job.crawl_depth
            item['job']['processing_patterns'] = self.job.processing_patterns
            item['job']['days'] = self.job.days
            item['job']['ajax'] = self.job.ajax
            item['job']['min_description'] = self.job.min_description
            return item
So after running two or three jobs my spider_closed gets called multiple times, instead of just once as expected.
So what is wrong here?
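One thing worth checking (a sketch, not a verified fix): dispatcher.connect(self.spider_closed, signals.spider_closed) in __init__ registers the handler on the global dispatcher without binding it to a particular sender, so every MySpider created for a later job also receives the spider_closed signal of every other spider finishing in the same process. Scrapy's standard from_crawler hook lets you connect the handler through the spider's own crawler instead, which scopes it to that crawler; this assumes the spider class (not an instance) is passed to runner.crawl, e.g. runner.crawl(MySpider.MySpider, job=job).

    class MySpider(CrawlSpider):
        # __init__ as above, but without the dispatcher.connect(...) line

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            # Standard Scrapy hook: build the spider, then connect spider_closed
            # through this crawler's own signal manager so the handler fires only
            # when this particular spider closes.
            spider = super(MySpider, cls).from_crawler(crawler, *args, **kwargs)
            crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
            return spider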
Related
For some reason, in my fruit scraper, I cannot access anything from the listify function.
I'm getting an error, for example: NameError: name 'family' is not defined.
And I can't figure out what is wrong with my code - is my function bad, or am I doing something wrong with the class?
import requests
import json
import random
import pickle


class FruitScraper():
    def __init__(self):
        self.name = []
        self.id = []
        self.family = []
        self.genus = []
        self.order = []
        self.carbohydrates = []
        self.protein = []
        self.fat = []
        self.calories = []
        self.sugar = []

    def scrape_all_fruits(self):
        data_list = []
        try:
            for ID in range(1, 10):
                url = f'https://www.fruityvice.com/api/fruit/{ID}'
                response = requests.get(url)
                data = response.json()
                data_list.append(data)
        except:
            pass
        return data_list

    def listify(self, stats):
        alist = json.dumps(self.scrape_all_fruits())
        jsonSTr = json.loads(alist)
        for i in jsonSTr:
            try:
                self.name.append(i['name'])
                self.id.append(i['id'])
                self.family.append(i['family'])
                self.genus.append(i['genus'])
                self.order.append(i['order'])
                self.carbohydrates.append(i['nutritions']['carbohydrates'])
                self.protein.append(i['nutritions']['protein'])
                self.fat.append(i['nutritions']['fat'])
                self.calories.append(i['nutritions']['calories'])
                self.sugar.append(i['nutritions']['sugar'])
            except:
                pass
        return stats

    def get_summary(self):
        for i in self.listify(zip(self.fat, self.protein, self.calories, self.sugar, self.carbohydrates, self.name)):
            nutr_stats = f'\nNutrients maximum statistics:\nFat: {max(self.fat)}\nProtein: {max(self.protein)}\nCarbohydrates: {max(self.carbohydrates)}\nCalories: {max(self.calories)}\nSugar: {max(self.sugar)}' \
                         f'\nNutrients minimum statistics:\nFat: {min(self.fat)}\nProtein: {min(self.protein)}\nCarbohydrates: {min(self.carbohydrates)}\nCalories: {min(self.calories)}\nSugar: {min(self.sugar)}' \
                         f'\nTotal fruits scraped: {len(self.name)}'
        return nutr_stats


Scraped_info = FruitScraper().scrape_all_fruits()
Listified_info = FruitScraper().listify(family)
Fruits_statistics = FruitScraper().get_summary()
It's my first time doing OOP.
Please consider changing this
Scraped_info = FruitScraper().scrape_all_fruits()
Listified_info = FruitScraper().listify(family)
Fruits_statistics = FruitScraper().get_summary()
to
myScraper = FruitScraper()
Scraped_info = myScraper.scrape_all_fruits()
myScraper.listify()
Fruits_statistics = myScraper.get_summary()
Otherwise you create three different objects of this class and discard them, with all their attributes, after running a single method on each.
It is probably also critical that family is never defined when you use it in this line of the code:
Listified_info = myScraper.listify(family)
But I can't see how you intended to use the parameter stats in your method listify(). It is just received and returned. I suggest that you change:
def listify(self, stats):
to
def listify(self):
and remove
return stats
If you want listify() to return the lists stored inside the object of this class, you may do the following (but this is not the OOP way of doing things):
import requests
import json
import copy


class FruitScraper():
    def __init__(self):
        self.name = []
        self.id = []
        self.family = []
        self.genus = []
        self.order = []
        self.carbohydrates = []
        self.protein = []
        self.fat = []
        self.calories = []
        self.sugar = []

    def collect_all_lists(self):
        self.allLists = {'name': self.name, 'id': self.id, 'family': self.family, 'genus': self.genus,
                         'order': self.order, 'carbohydrates': self.carbohydrates, 'protein': self.protein,
                         'fat': self.fat, 'calories': self.calories, 'sugar': self.sugar}

    def scrape_all_fruits(self):
        data_list = []
        try:
            for ID in range(1, 10):
                url = f'https://www.fruityvice.com/api/fruit/{ID}'
                response = requests.get(url)
                data = response.json()
                data_list.append(data)
        except:
            pass
        return data_list

    def listify(self):
        alist = json.dumps(self.scrape_all_fruits())
        jsonSTr = json.loads(alist)
        for i in jsonSTr:
            try:
                self.name.append(i['name'])
                self.id.append(i['id'])
                self.family.append(i['family'])
                self.genus.append(i['genus'])
                self.order.append(i['order'])
                self.carbohydrates.append(i['nutritions']['carbohydrates'])
                self.protein.append(i['nutritions']['protein'])
                self.fat.append(i['nutritions']['fat'])
                self.calories.append(i['nutritions']['calories'])
                self.sugar.append(i['nutritions']['sugar'])
            except:
                pass
        self.collect_all_lists()
        return copy.deepcopy(self.allLists)

    def get_summary(self):
        for i in self.listify():
            nutr_stats = f'\nNutrients maximum statistics:\nFat: {max(self.fat)}\nProtein: {max(self.protein)}\nCarbohydrates: {max(self.carbohydrates)}\nCalories: {max(self.calories)}\nSugar: {max(self.sugar)}' \
                         f'\nNutrients minimum statistics:\nFat: {min(self.fat)}\nProtein: {min(self.protein)}\nCarbohydrates: {min(self.carbohydrates)}\nCalories: {min(self.calories)}\nSugar: {min(self.sugar)}' \
                         f'\nTotal fruits scraped: {len(self.name)}'
        return nutr_stats


myScraper = FruitScraper()
Scraped_info = myScraper.scrape_all_fruits()
Listified_info = myScraper.listify()
Fruits_statistics = myScraper.get_summary()
I am writing a program that reads data from an Excel file and loads it into another system via an API.
The function that fetches data from Excel and sends it via the API runs in a separate thread to keep the GUI alive.
Loading via the API is a long process; most of the time is spent sending requests and waiting for API responses.
I wanted to send each Excel line in a separate process. How can I do that? (One possible approach is sketched after the code below.)
def api_import(k, sheet, number_columns, items_number, ok, total, total_success):
    item = sheet[get_column_letter(3)+str(k)].value
    templ_id = sheet[get_column_letter(number_columns)+str(k)].value
    item_id = get_item_id(item)['item_id']
    query = {"template_id": str(templ_id)}
    resp = templ_attach(item_id, query)
    if resp == 200:
        ok += 1
    total += 1
    item_create_param = 0
    item_update_param = 0
    item_error = 0
    for l in range(number_columns):
        if l < 4:
            pass
        else:
            param_name = sheet[get_column_letter(l)+str(1).upper().replace(" ", "")].value
            param_description = sheet[get_column_letter(l)+str(2)].value
            param_value = sheet[get_column_letter(l)+str(k)].value
            param_type = sheet[get_column_letter(l)+str(3)].value
            if "[T]" in param_type:
                param_type = 4
                query = {"name": str(param_name), "type": param_type, "description": str(param_description), "text_value": str(param_value)}
            elif "[D]" in param_type:
                param_type = 0
                query = {"name": str(param_name), "type": param_type, "description": str(param_description), "numeric_value": param_value}
            else:
                param_type = 1
                query = {"name": str(param_name), "type": param_type, "description": str(param_description), "logical_value": param_value}
            resp = input_param(item_id, query)
            if resp == 201:
                item_create_param += 1
            elif resp == 202:
                item_update_param += 1
            else:
                item_error += 1
    if item_error == 0:
        total_success += 1
    return {'total': total, 'total_success': total_success, 'ok': ok}
class MainWindow(QDialog):
    def __init__(self):
        super(MainWindow, self).__init__()
        loadUi("gui.ui", self)
        self.browse.clicked.connect(self.browsefiles)
        self.load.clicked.connect(self.send)

    def browsefiles(self):
        fname = QFileDialog.getOpenFileName(self, 'Open file', 'C:\Program files', 'Excel Spreadsheet files (*.xlsx, *.xls)')
        self.filename.setText(fname[0])

    def send(self):
        self.worker = WorkerThread()
        self.worker.start()
        # self.worker.worker_complete.connect(self.evt_worker_finished)
        self.worker.update_progress.connect(self.evt_update_progress)

    # def evt_worker_finished(self, emp):
    #     self.QMessageBox.information(self, "Done!", "Loading finished!\n\n{} {}".format(emp["fn"], emp["ln"]))

    def evt_update_progress(self, val):
        self.progressBar.setValue(val)
class WorkerThread(QThread):
    update_progress = pyqtSignal(int)
    worker_complete = pyqtSignal(dict)

    def run(self):
        start_time = time.time()
        file_path = "import.xlsx"
        print(file_path)
        wb = load_workbook(file_path)
        sheetnames = wb.sheetnames
        for sheet in wb:
            print(sheet)
            number_rows = sheet.max_row
            print(number_rows)
            number_columns = sheet.max_column
            items_number = number_rows - 3
            ok = 0
            total = 0
            progress = 0
            total_success = 0
            for k in range(number_rows + 1):
                if k > 3:
                    report = api_import(k, sheet, number_columns, items_number, ok, total, total_success)
                    total = report['total']
                    ok = report['ok']
                    total_success = report['total_success']
                    progress = round(total / items_number * 100)
                    print("{}%".format(progress))
                    self.update_progress.emit(progress)
        self.worker_complete.emit({"emp_id": 1234, "fn": "XXX", "ln": "YYYY"})
        end_time = time.time() - start_time
        item_time = end_time / total
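Since the bottleneck is waiting on the API (I/O-bound work), a thread pool inside the existing WorkerThread is usually enough; separate processes would additionally require everything handed to them (including the openpyxl worksheet) to be picklable. The sketch below only illustrates that structure: api_import_row is a hypothetical refactor of api_import that processes one row from plain cell values and returns its own counters instead of mutating shared ones.

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def import_rows_in_parallel(self, sheet, number_rows, number_columns, items_number):
        # Read the cell values up front in this thread; openpyxl worksheets are not
        # meant for concurrent access, so only the HTTP work goes to the pool.
        rows = {k: [sheet.cell(row=k, column=c).value for c in range(1, number_columns + 1)]
                for k in range(4, number_rows + 1)}
        ok = total = total_success = 0
        with ThreadPoolExecutor(max_workers=8) as pool:  # 8 concurrent API calls; tune as needed
            futures = [pool.submit(api_import_row, k, values) for k, values in rows.items()]
            for future in as_completed(futures):
                result = future.result()  # hypothetical dict: {'ok': 0 or 1, 'error_free': True/False}
                total += 1
                ok += result['ok']
                if result['error_free']:
                    total_success += 1
                self.update_progress.emit(round(total / items_number * 100))
        return ok, total, total_success

This would be a method on WorkerThread, called from run() in place of the inner "for k" loop, so the progress signal is still emitted from the worker thread as before.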
I'm unable to list all entries in Kaltura. An ApiException with the message "Unable to generate list. max matches value was reached" (Error: QUERY_EXCEEDED_MAX_MATCHES_ALLOWED) gets triggered.
I tried to work around the issue by setting my session privileges to disableentitlement:
class class_chk_integrity():
    client = None
    pagesize = 0

    def __init__(self, worker_num, progress):
        self.pagesize = 30
        self.worker_num = worker_num
        self.progress = progress
        config = KalturaConfiguration(2723521)
        config.serviceUrl = "https://www.kaltura.com/"
        self.client = KalturaClient(config)
        ks = self.client.session.start("KALTURA_ADMIN_SECRET",
                                       "email#email.com",
                                       KalturaPluginsCore.KalturaSessionType.ADMIN,
                                       "KALTURA_PARTNER_ID",
                                       432000,
                                       "disableentitlement")
        self.client.setKs(ks)
I also tried to filter based on the IDs. However, I can't manage to get filter.idNotIn to work properly.
def get_total_reg(self, cont, lastEntryIds, lastEntryCreatedAt):
    filter = KalturaPluginsCore.KalturaBaseEntryFilter()
    if lastEntryIds != "":
        filter.idNotIn = lastEntryIds
    filter.orderBy = KalturaBaseEntryOrderBy.CREATED_AT_DESC
    pager = KalturaPluginsCore.KalturaFilterPager()
    pageIndex = 1
    entriesGot = 0
    pager.pageSize = self.pagesize
    pager.setPageIndex = pageIndex
    result = self.client.baseEntry.list(filter, pager)
    totalCount = result.totalCount
    if totalCount > 10000:
        totalCount = 9970
    if totalCount <= 0:
        cont = False
    while entriesGot < totalCount:
        pager.pageSize = self.pagesize
        pageIndex += 1
        pager.pageIndex = pageIndex
        result = self.client.baseEntry.list(filter, pager)
        entriesGot += len(result.objects)
        for e in result.objects:
            if lastEntryIds == "":
                lastEntryIds.append(e.id)
            else:
                lastEntryIds.append(e.id)
            lastEntryCreatedAt = e.createdAt
    return result.totalCount, self.pagesize, cont, lastEntryIds, lastEntryCreatedAt
This is how I'm calling the functions:
if __name__ == '__main__':
    try:
        log = _ServiceUtils.log()
        log.setup('all', 'integrity')
        cont = True
        lastEntryIds = []
        lastEntryCreatedAt = 0
        while cont is True:
            kmc = class_chk_integrity(0, 0)
            kmc_total_reg, kmc_page_size, cont, lastEntryIds, lastEntryCreatedAt = kmc.get_total_reg(cont, lastEntryIds, lastEntryCreatedAt)
            interval = 10
            max_threads = math.ceil(kmc_total_reg / (interval * kmc_page_size))
            # max_threads = 1
            threads_list = []
            print('TOTAL REG : %s | PAGE_SIZE : %s | INTERVAL : %s | THREADS : %s' % (kmc_total_reg, kmc_page_size, interval, max_threads))
            progress = class_progress_thread(max_threads)
            for index in range(0, max_threads):
                page_ini = index * interval
                page_end = index * interval + interval
                progress.add_worker_progress(index, datetime.now())
                threads_list.append(threading.Thread(target=thread_chk_integrity, args=(index, log, index * interval + 1, index * interval + interval, progress)))
            threads_list.append(threading.Thread(target=thread_output_progress, args=(progress, max_threads)))
            for thread in threads_list:
                thread.start()
            for thread in threads_list:
                thread.join()
            while not progress.stop(): time.sleep(30)
    except KeyboardInterrupt:
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)
I'd appreciate any help with this.
Thank you for your attention.
if totalCount > 10000:
    totalCount = 9970
I'm curious to know why you are changing the totalCount this way.
Short answer - paging works only as long as the result set is up to 10K.
To work around that, sort the results by creation date (as you did), and when you get to 10K, start a new search where the created_at date in the filter is the last value you got from the previous search. Reset your paging, of course.
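A minimal sketch of that pattern (untested, and assuming the standard Kaltura Python client, where KalturaBaseEntryFilter exposes createdAtLessThanOrEqual, and that client is an already configured KalturaClient): each outer pass restarts the query below the oldest createdAt seen so far, so no single search ever has to page past the 10K window.

    filter = KalturaPluginsCore.KalturaBaseEntryFilter()
    filter.orderBy = KalturaBaseEntryOrderBy.CREATED_AT_DESC
    pager = KalturaPluginsCore.KalturaFilterPager()
    pager.pageSize = 500                              # 500 is the maximum page size

    oldest_created_at = None
    seen_ids = set()
    while True:
        if oldest_created_at is not None:
            # Start a fresh search window below the last timestamp from the previous pass.
            filter.createdAtLessThanOrEqual = oldest_created_at
        pager.pageIndex = 1                           # reset paging for the new window
        got_new_entries = False
        while True:
            result = client.baseEntry.list(filter, pager)
            if not result.objects:
                break
            for e in result.objects:
                if e.id not in seen_ids:              # entries on the boundary timestamp repeat
                    seen_ids.add(e.id)
                    got_new_entries = True
                oldest_created_at = e.createdAt
            pager.pageIndex += 1
            if pager.pageIndex * pager.pageSize > 9500:   # stay safely under the 10K limit
                break
        if not got_new_entries:
            break

This keeps idNotIn out of the picture entirely; the filter only ever carries an order and a createdAtLessThanOrEqual boundary, which is all the workaround needs.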
I have a Django function that takes in a Nessus file and parses the data before saving it to the database. My Nessus file typically has about 30k rows, and saving it to the database can take as much as 2 hours. I have tried using bulk_create but it breaks the code. I am on Django 1.11. Is there a way I can speed up these large inserts into the database (Postgres)?
Here is my code:
def process_nessus_file(*args, **kwargs):
    process_obj = kwargs.get('file')
    context = kwargs.get('context')
    request = kwargs.get('request')
    file_obj = process_obj.first()
    file_path = file_obj.file.path
    context = etree.iterparse(
        file_path,
        events=('end', ),
        tag="ReportHost"
    )
    total_issues = 0
    detected_issues = 0
    undetected_issues = 0
    already_exist_issue = 0
    low_risk_count = 0
    medium_risk_count = 0
    high_risk_count = 0
    critical_risk_count = 0
    low_new_issue = 0
    medium_new_issue = 0
    high_new_issue = 0
    critical_new_issue = 0
    vul_history = []
    for event, elem in context:
        first_identified = None
        last_seen = None
        host = elem.get('name')
        logger.info('Processing issue for host : {}'.format(host))
        for child in elem:
            if child.tag == "HostProperties":
                for host_prop_tags in child:
                    if host_prop_tags.attrib['name'] == "HOST_START":
                        first_identified = host_prop_tags.text
                    elif host_prop_tags.attrib['name'] == "HOST_END":
                        last_seen = host_prop_tags.text
            if child.tag == "ReportItem":
                main_tags = child.attrib
                child_tags = dict()
                for ch_tags in child:
                    if ch_tags.text:
                        tag_text = ch_tags.text.strip()
                    else:
                        tag_text = ch_tags.text
                    child_tags[ch_tags.tag] = tag_text
                if child_tags.get('solution') and \
                        child_tags.get('solution') in ['n/a', 'N/A']:
                    child_tags['solution'] = ''
                plugin_output = child_tags.get('plugin_output')
                pluginid = int(main_tags.get('pluginID'))
                if plugin_output and (pluginid == 10107):
                    if re.search(BANNER_PATTERN, plugin_output):
                        banner_pattern = plugin_output.replace("{}".format(BANNER_PATTERN), "")
                        banner = banner_pattern.strip()
                    else:
                        banner = ''
                else:
                    banner = ''
                risk = child_tags.get('risk_factor')
                synopsis = child_tags.get('synopsis')
                description = child_tags.get('description')
                solution = child_tags.get('solution')
                protocol = main_tags.get('protocol')
                port = main_tags.get('port')
                pluginname = main_tags.get('pluginName')
                svcname = main_tags.get('svc_type')
                try:
                    host_type = get_host_type(host)
                    user_host = check_host_exists(host, host_type)
                    if user_host and not NessusData.objects.filter(
                        plugin_id=int(pluginid), host=host,
                        port=int(port), name=pluginname
                    ).exists():
                        try:
                            host_link_obj = Host.objects.get(
                                host=host
                            )
                        except Host.MultipleObjectsReturned:
                            host_link_obj = Host.objects.filter(
                                host=host
                            ).first()
                        except Host.DoesNotExist:
                            host_link_obj = Host.objects.create(
                                host=host,
                                user_host=user_host
                            )
                        nessus_obj = NessusFile.objects.create(
                            user_host=user_host,
                            host_link=host_link_obj,
                            linked_file=file_obj,
                            plugin_id=int(pluginid),
                            risk=risk, host=host,
                            protocol=protocol, port=int(port),
                            banner=banner, name=pluginname,
                            svc_type=svcname,
                            description=description,
                            first_identified=first_identified,
                            last_seen=last_seen,
                            synopsis=synopsis,
                            plugin_output=plugin_output,
                            solution=solution
                        )
                        issue = "Issue with host {}, port {} and"\
                                " pluginID {} is added.".format(
                                    nessus_obj.host, nessus_obj.port,
                                    nessus_obj.plugin_id
                                )
                        NessusFileLog.objects.create(
                            linked_file=file_obj,
                            issue_type="new",
                            issue=issue
                        )
                        detected_issues = detected_issues + 1
                        if risk == 'Medium':
                            medium_new_issue = medium_new_issue + 1
                        elif risk == 'Low':
                            low_new_issue = low_new_issue + 1
                        elif risk == 'High':
                            high_new_issue = high_new_issue + 1
                        elif risk == 'Critical':
                            critical_new_issue = critical_new_issue + 1
                    else:
                        nessus_obj = NessusFile.objects.filter(
                            plugin_id=int(pluginid), host=host,
                            port=int(port), name=pluginname
                        ).first()
                        if nessus_obj and not nessus_obj.last_seen:
                            nessus_obj.last_seen = last_seen
                            nessus_obj.save()
                        issue = "Issue with host {}, port {} and"\
                                " pluginID {} already exists.".format(host, port, pluginid)
                        NessusFileLog.objects.create(
                            linked_file=file_obj,
                            issue_type="duplicate",
                            issue=issue
                        )
                        already_exist_issue = already_exist_issue + 1
                except Exception as e:
                    pass
                if risk == 'Medium':
                    medium_risk_count = medium_risk_count + 1
                elif risk == 'Low':
                    low_risk_count = low_risk_count + 1
                elif risk == 'High':
                    high_risk_count = high_risk_count + 1
                elif risk == 'Critical':
                    critical_risk_count = critical_risk_count + 1
                total_issues = total_issues + 1
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
I heard using raw SQL queries will speed it up, but I can't wrap my head around the process.
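Raw SQL is usually not necessary for this; the common wins are (a) one pre-loaded set of existing keys instead of a per-row .exists() query and (b) batching the inserts with bulk_create (available in Django 1.11; note it does not call save() or send signals). The sketch below is only an outline under those assumptions, reusing the field names from the code above and assuming plugin_id and port are integer fields:

    BATCH_SIZE = 1000

    # One query up front instead of one .exists() per row (same NessusData check as above).
    existing_keys = set(
        NessusData.objects.values_list('plugin_id', 'host', 'port', 'name')
    )

    pending = []

    def flush(pending):
        if pending:
            NessusFile.objects.bulk_create(pending, batch_size=BATCH_SIZE)
            del pending[:]

    # Inside the ReportItem loop, instead of NessusFile.objects.create(...):
    key = (int(pluginid), host, int(port), pluginname)
    if user_host and key not in existing_keys:
        existing_keys.add(key)
        pending.append(NessusFile(
            user_host=user_host, host_link=host_link_obj, linked_file=file_obj,
            plugin_id=int(pluginid), risk=risk, host=host, protocol=protocol,
            port=int(port), banner=banner, name=pluginname, svc_type=svcname,
            description=description, first_identified=first_identified,
            last_seen=last_seen, synopsis=synopsis,
            plugin_output=plugin_output, solution=solution,
        ))
        if len(pending) >= BATCH_SIZE:
            flush(pending)

    # After the parsing loop:
    flush(pending)

The NessusFileLog rows can be collected and bulk_created the same way, and wrapping the whole import in transaction.atomic() avoids paying for one commit per insert.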
I am having some trouble understanding JSON dictionaries and arrays. I have a script that is scraping information from a website.
models.txt is just a list of model numbers, such as
30373
30374
30375
and json_descriptions.txt is a list of the keys I want
sku
price
listprice
issoldout
The code is:
import urllib
import re
import json

modelslist = open("models.txt").read()
modelslist = modelslist.split("\n")
descriptionlist = open("json_descriptions.txt").read()
descriptionlist = descriptionlist.split("\n")

for model in modelslist:
    htmltext = urllib.urlopen("http://dx.com/p/GetProductInfoRealTime?skus="+model)
    htmltext = json.load(htmltext)
    if htmltext['success'] == True:
        def get_data(dict_index, key):
            return htmltext[u"data"][dict_index][key]
        for description in descriptionlist:
            info = description, (get_data(0, description))
            print info
    else:
        print "product does not exist"
If I print out info I get:
sku 30373
price 9.10
listprice 17.62
issoldout False
so that means info[0] is:
sku
price
listprice
issoldout
and info[1] is:
30373
9.10
17.62
False
I would like to know if there is a way that I can have this:
loop 1 = ['sku','30373','price','4.90','listprice','0','issoldout','False']
loop 2 = ['sku','30374','price','10.50','listprice','0','issoldout','False']
info[0] = sku, info[1] = 30373, info[2] = price, info[3] = 4.90, info[4] = listprice, info[5] = 0, info[6] = issoldout, info[7] = False, and then repeat that with a new list for the next loop through.
I have tried using info = json.dumps(info) but that just gives info[0] = [[[[, info[1] = """", info[2] = spli, and so on.
Like this?
for model in modelslist:
    info = []  # start a fresh list for each model
    htmltext = urllib.urlopen("http://dx.com/p/GetProductInfoRealTime?skus="+model)
    htmltext = json.load(htmltext)
    if htmltext['success'] == True:
        def get_data(dict_index, key):
            return htmltext[u"data"][dict_index][key]
        for description in descriptionlist:
            info.append(description)
            info.append(get_data(0, description))
        print info
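If keeping each key next to its value matters more than the flat list layout, a dict per model may be easier to work with. A small sketch using the same loop (same assumptions as the code above, values taken from the example output in the question):

    for model in modelslist:
        htmltext = json.load(urllib.urlopen("http://dx.com/p/GetProductInfoRealTime?skus=" + model))
        if htmltext['success'] == True:
            # One dict per model, e.g. {'sku': 30373, 'price': 9.10, 'listprice': 17.62, 'issoldout': False}
            info = {description: htmltext[u"data"][0][description] for description in descriptionlist}
            print info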