I have written an exporter in Python that converts json to metrics from Prometheus. It works with few data, but when I test this with very many datasets whose spacing is miliseconds, it stops working.
JSON (extract):
{
"Acquisition": {
"refTriggerName": "NO_REF_TRIGGER",
"refTriggerStamp": 1666592215243657724,
"channelTimeSinceRefTrigger": [0e+00, 2.5e-04, ...]
"channelValues": {
"values": [4.861855e+00, 4.8581786e+00,
...}
json_exporter.py:
from prometheus_client import start_http_server, Metric, REGISTRY
import json
import requests
import sys
import time
class JsonCollector(object):
def __init__(self, endpoint):
self._endpoint = endpoint
def collect(self):
# Fetch the JSON
response = json.loads(requests.get(self._endpoint).content.decode('UTF-8'))
metric = Metric('fair_acquisition_signal', 'single sinus signal example', 'gauge')
valuesArray = response['Acquisition']['channelValues']['values']
refTriggerStamp = response['Acquisition']['refTriggerStamp']
timestampArray = response['Acquisition']['channelTimeSinceRefTrigger']
counter = 0
while(counter < len(valuesArray)) and (counter < len(timestampArray)):
timestampV = refTriggerStamp/ 1e9 + timestampArray[counter]
metric.add_sample('fair_acquisition_signal', value=valuesArray[counter], timestamp=timestampV, labels={})
#print(str(datetime.fromtimestamp(timstampV)) + ' ' + str(valuesArray[counter]))
counter += 1
#for sample in metric.samples:
# print(sample)
#print(str(len(metric.samples)))
yield metric
if __name__ == '__main__':
# Usage: json_exporter.py port endpoint
start_http_server(int(sys.argv[1]))
REGISTRY.register(JsonCollector(sys.argv[2]))
while True: time.sleep(1)
The data should actually result in a sinus, but in Prometheus it looks like this.
visualization in Prometheus:
prometheus visualization
Does anyone know where my error is?
Related
This is quite a specific question regarding nohup in linux, which runs a python file.
Back-story, I am trying to save down streaming data (from IG markets broadcast signal). And, as I am trying to run it via a remote-server (so I don't have to keep my own local desktop up 24/7),
somehow, the nohup will not engage when it 'listen's to a broadcast signal.
Below, is the example python code
#!/usr/bin/env python
#-*- coding:utf-8 -*-
"""
IG Markets Stream API sample with Python
"""
user_ = 'xxx'
password_ = 'xxx'
api_key_ = 'xxx' # this is the 1st api key
account_ = 'xxx'
acc_type_ = 'xxx'
fileLoc = 'marketdata_IG_spx_5min.csv'
list_ = ["CHART:IX.D.SPTRD.DAILY.IP:5MINUTE"]
fields_ = ["UTM", "LTV", "TTV", "BID_OPEN", "BID_HIGH", \
"BID_LOW", "BID_CLOSE",]
import time
import sys
import traceback
import logging
import warnings
warnings.filterwarnings('ignore')
from trading_ig import (IGService, IGStreamService)
from trading_ig.lightstreamer import Subscription
cols_ = ['timestamp', 'data']
# A simple function acting as a Subscription listener
def on_prices_update(item_update):
# print("price: %s " % item_update)
print("xxxxxxxx
))
# A simple function acting as a Subscription listener
def on_charts_update(item_update):
# print("price: %s " % item_update)
print(xxxxxx"\
.format(
stock_name=item_update["name"], **item_update["values"]
))
res_ = [xxxxx"\
.format(
stock_name=item_update["name"], **item_update["values"]
).split(' '))]
# display(pd.DataFrame(res_))
try:
data_ = pd.read_csv(fileLoc)[cols_]
data_ = data_.append(pd.DataFrame(res_, columns = cols_))
data_.to_csv(fileLoc)
print('there is data and we are reading it')
# display(data_)
except:
pd.DataFrame(res_, columns = cols_).to_csv(fileLoc)
print('there is no data and we are saving first time')
time.sleep(60) # sleep for 1 min
def main():
logging.basicConfig(level=logging.INFO)
# logging.basicConfig(level=logging.DEBUG)
ig_service = IGService(
user_, password_, api_key_, acc_type_
)
ig_stream_service = IGStreamService(ig_service)
ig_session = ig_stream_service.create_session()
accountId = account_
################ my code to set sleep function to sleep/read at only certain time intervals
s_time = time.time()
############################
# Making a new Subscription in MERGE mode
subscription_prices = Subscription(
mode="MERGE",
# make sure to put L1 in front of the instrument name
items= list_,
fields= fields_
)
# adapter="QUOTE_ADAPTER")
# Adding the "on_price_update" function to Subscription
subscription_prices.addlistener(on_charts_update)
# Registering the Subscription
sub_key_prices = ig_stream_service.ls_client.subscribe(subscription_prices)
print('this is the line here')
input("{0:-^80}\n".format("HIT CR TO UNSUBSCRIBE AND DISCONNECT FROM \
LIGHTSTREAMER"))
# Disconnecting
ig_stream_service.disconnect()
if __name__ == '__main__':
main()
#######
Then, I try to run it on linux using this command : nohup python marketdata.py
where marketdata.py is basically the python code above.
Somehow, the nohup will not engage....... Any experts/guru who might see what I am missing in my code?
First one:
### configuration details
TELEGRAM_TOKEN = '' # telegram bot token
TELEGRAM_CHANNEL ='' # channel id
INTERVAL = '1m' # binance time interval
SHORT_EMA = 7 # short interval for ema
LONG_EMA = 21 # long interval for ema
Here is my second code:
import requests
import talib
import time
import numpy as np
import websocket
from config import TELEGRAM_TOKEN, TELEGRAM_CHANNEL , INTERVAL, SHORT_EMA , LONG_EMA
def streamKline(currency, interval):
websocket.enableTrace(False)
socket = f'wss://stream.binance.com:9443/ws/{currency}#kline_{interval}'
ws = websocket.WebSocketApp(socket)
ws.run_forever()
#SYMBOLS TO LOOK FOR ALERTS
SYMBOLS = [
"ETHUSDT",
"BTCUSDT",
"ATOMUSDT",
"BNBUSDT",
"FTMBUSD",
"ENJUSDT",
"WAXPUSDT"
]
#sending alerts to telegram
def send_message(message):
url = "https://api.telegram.org/bot{}/sendMessage?chat_id={}&text={}&parse_mode=markdown".format(TELEGRAM_TOKEN,TELEGRAM_CHANNEL,message)
res = requests.get(url);print(url);
return res
# getting klines data to process
def streamKline(symbol):
data = socket.streamKline(symbol=symbol,interval=INTERVAL,limit=300) # more data means more precision but at the trade off between speed and time
return_data = []
# taking closing data for each kline
for each in data:
return_data.append(float(each[4])) # 4 is the index of the closing data in each kline
return np.array(return_data) # returning as numpy array for better precision and performance
def main():
# making a infinite loop that keeps checking for condition
while True:
#looping through each coin
for each in SYMBOLS:
data = streamKline(each)
ema_short = talib.EMA(data,int(SHORT_EMA))
ema_long = talib.EMA(data,int(LONG_EMA))
last_ema_short = ema_short[-2]
last_ema_long = ema_long[-2]
ema_short = ema_short[-1]
ema_long = ema_long[-1]
# conditions for alerts
if(ema_short > ema_long and last_ema_short < last_ema_long):
message = each + "bullcoming "+ str(SHORT_EMA) + " over "+str(LONG_EMA);print(each ,"alert came");
send_message(message);
time.sleep(0.5);
# calling the function
if __name__ == "__main__":
main()
The part of config is all settle done, just second for the kline data, the error mention lot like this.
data = socket.streamKline(symbol=symbol,interval=INTERVAL,limit=300) # more data means more precision but at the
trade off between speed and time
NameError: name 'socket' is not defined
I just don't know how to do it, I want build a ema alert that can give me a message when I am not watching chart, through this way seems not work, I have tried many times, and also find many video but still, I am just an beginner, nothing improving at all.
Need some help to set the configuration for sasl.mechanism PLAIN (API) and GSSAPI (Kerberos) authentication.
We are using confluent Kafka here, there are two scripts, one a python script and the second one is a bash script which calls the python one. You can find the script below.
Thanks for the help in advance!
import json
import os
import string
import random
import socket
import uuid
import re
from datetime import datetime
import time
import hashlib
import math
import sys
from functools import cache
from confluent_kafka import Producer, KafkaError, KafkaException
topic_name = os.environ['TOPIC_NAME']
partition_count = int(os.environ['PARTITION_COUNT'])
message_key_template = json.loads(os.environ['KEY_TEMPLATE'])
message_value_template = json.loads(os.environ['VALUE_TEMPLATE'])
message_header_template = json.loads(os.environ['HEADER_TEMPLATE'])
bootstrap_servers = os.environ['BOOTSTRAP_SERVERS']
perf_counter_batch_size = int(os.environ.get('PERF_COUNTER_BATCH_SIZE', 100))
messages_per_aggregate = int(os.environ.get('MESSAGES_PER_AGGREGATE', 1))
max_message_count = int(os.environ.get('MAX_MESSAGE_COUNT', sys.maxsize))
def error_cb(err):
""" The error callback is used for generic client errors. These
errors are generally to be considered informational as the client will
automatically try to recover from all errors, and no extra action
is typically required by the application.
For this example however, we terminate the application if the client
is unable to connect to any broker (_ALL_BROKERS_DOWN) and on
authentication errors (_AUTHENTICATION). """
print("Client error: {}".format(err))
if err.code() == KafkaError._ALL_BROKERS_DOWN or \
err.code() == KafkaError._AUTHENTICATION:
# Any exception raised from this callback will be re-raised from the
# triggering flush() or poll() call.
raise KafkaException(err)
def acked(err, msg):
if err is not None:
print("Failed to send message: %s: %s" % (str(msg), str(err)))
producer_configs = {
'bootstrap.servers': bootstrap_servers,
'client.id': socket.gethostname(),
'error_cb': error_cb
}
# TODO: Need to support sasl.mechanism PLAIN (API) and GSSAPI (Kerberos) authentication.
# TODO: Need to support truststores for connecting to private DCs.
producer = Producer(producer_configs)
# generates a random value if it is not cached in the template_values dictionary
def get_templated_value(term, template_values):
if not term in template_values:
template_values[term] = str(uuid.uuid4())
return template_values[term]
def fill_template_value(value, template_values):
str_value = str(value)
template_regex = '{{(.+?)}}'
templated_terms = re.findall(template_regex, str_value)
for term in templated_terms:
str_value = str_value.replace(f"{{{{{term}}}}}", get_templated_value(term, template_values))
return str_value
def fill_template(template, templated_terms):
# TODO: Need to address metadata field, as it's treated as a string instead of a nested object.
return {field: fill_template_value(value, templated_terms) for field, value in template.items()}
#cache
def get_partition(lock_id):
bits = 128
bucket_size = 2**bits / partition_count
partition = (int(hashlib.md5(lock_id.encode('utf-8')).hexdigest(), 16) / bucket_size)
return math.floor(partition)
sequence_number = int(time.time() * 1000)
sequence_number = 0
message_count = 0
producing = True
start_time = time.perf_counter()
aggregate_message_counter = 0
# cache for templated term values so that they match across the different templates
templated_values = {}
try:
while producing:
sequence_number += 1
aggregate_message_counter += 1
message_count += 1
if aggregate_message_counter % messages_per_aggregate == 0:
# reset templated values
templated_values = {}
else:
for term in list(templated_values):
if term not in ['aggregateId', 'tenantId']:
del(templated_values[term])
# Fill in templated field values
message_key = fill_template(message_key_template, templated_values)
message_value = fill_template(message_value_template, templated_values)
message_header = fill_template(message_header_template, templated_values)
ts = datetime.utcnow().isoformat()[:-3]+'Z'
message_header['timestamp'] = ts
message_header['sequence_number'] = str(sequence_number)
message_value['timestamp'] = ts
message_value['sequenceNumber'] = sequence_number
lock_id = message_header['lock_id']
partition = get_partition(lock_id) # partition by lock_id, since key could be random, but a given aggregate_id should ALWAYS resolve to the same partition, regardless of key.
# Send message
producer.produce(topic_name, partition=partition, key=json.dumps(message_key), value=json.dumps(message_value), headers=message_header, callback=acked)
if sequence_number % perf_counter_batch_size == 0:
producer.flush()
end_time = time.perf_counter()
total_duration = end_time - start_time
messages_per_second=(perf_counter_batch_size/total_duration)
print(f'{messages_per_second} messages/second')
# reset start time
start_time = time.perf_counter()
if message_count >= max_message_count:
break
except Exception as e:
print(f'ERROR: %s' % e)
sys.exit(1)
finally:
producer.flush()
I am trying to fetch status of all the builds for all my jobs.I have written a script it takes way too much time to execute.Is there anyway I can optimize the script? Any help will be appreciated.
def jenkinsconn():
server = jenkins.Jenkins('server',username=username,password=password)
jobs = server.get_jobs()
job_name_list=[]
build_number_list=[]
build_info_list=[]
status_list_dict={}
success=0
failure=0
unstable=0
aborted=0
#print dir(server)
for i in range(len(jobs)):
job_name=jobs[i]['name']
job_name_list.append(job_name)
for i in range(len(job_name_list)):
job_info=server.get_job_info(job_name_list[i])
lastbuilt=job_info['lastSuccessfulBuild']
if lastbuilt:
b_number=job_info['lastSuccessfulBuild']['number']
build_number_list.append(b_number)
build_zipped=zip(job_name_list,build_number_list)
for i ,j in build_zipped:
success=0
failure=0
unstable=0
aborted=0
for k in range(j):
build_info=server.get_build_info(i,k+1)
build_info_list.append(build_info)
status=build_info['result']
if status=="SUCCESS":
success+=1
elif status=="FAILURE":
failure+=1
elif status=="UNSTABLE":
unstable+=1
else:
aborted+=1
statuscount=[success,failure,unstable,aborted]
status_list_dict[i]=statuscount
If you only need the number of builds succeeding, failing, etc. then you can make do with one request per job, rather than a request per build like it looks like your code is doing. I can't find an method in the python-jenkins module to do this, but you can do it yourself with the Jenkins API.
Eg:
try: # Python 3
from urllib.request import urlopen
from urllib.parse import quote
except ImportError: # Python 2
from urllib2 import urlopen, quote
import json
import contextlib
status_list_dict = {}
with contextlib.closing(
urlopen("http://HOST_NAME:8080/api/json")
) as job_list_response:
job_list = json.load(job_list_response)["jobs"]
for job in job_list:
status_counts = [0,0,0,0]
with contextlib.closing(
urlopen(
"http://HOST_NAME:8080/job/{job_name}/api/json?tree=allBuilds[result]".format(
job_name=quote(job["name"])
)
)
) as build_list_response:
build_list = json.load(build_list_response)["allBuilds"]
for build_data in build_list:
if build_data["result"] == "SUCCESS":
status_counts[0] += 1
elif build_data["result"] == "FAILURE":
status_counts[1] += 1
elif build_data["result"] == "UNSTABLE":
status_counts[2] += 1
elif build_data["result"] == "ABORTED":
status_counts[3] += 1
status_list_dict[job["name"]] = status_counts
Hi all,
I'm trying to parse the metadata of 10,000 websites into a Pandas dataframe for an SEO / analytics application but the code is taking ages. I've been trying to do it on 1,000 websites and the code has been running for the last 3 hours (it works without problem on 10-50 websites).
Here's the sample data:
index site
0 http://www.google.com
1 http://www.youtube.com
2 http://www.facebook.com
3 http://www.cnn.com
... ...
10000 http://www.sony.com
Here's my Python (2.7) code:
# Importing dependencies
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import metadata_parser
# Loading the Pandas dataframe
df = pd.read_csv('final_urls')
# Utility functions
def meta(website, metadata):
full_url = website
parser = metadata_parser.MetadataParser(url=full_url)
if metadata == 'all':
return parser.metadata
else:
return parser.metadata[metadata]
def meta_all(website):
try:
result = meta(website, 'all')
except BaseException:
result = 'Exception'
return result
# Main
df['site'].apply(meta_all)
I'd like the code to be much faster. I've been using the metadata_parser library (https://github.com/jvanasco/metadata_parser) which relies heavily on requests and BeautifulSoup.
I understand I might be able to change the parser to lxml for the code to be faster. It's already installed on my machine so BeautifulSoup should use it as the primary choice.
Do you have any suggestion to get this code to run faster?
Thanks!
You can use Python Twisted (Twisted is an event-driven networking engine written in Python). You will need to install a few packages with pip, maybe twisted, pyopenssl and service_identity maybe others. This code works on Python 2.7 which you say you are using.
from twisted.internet import defer, reactor
from twisted.web.client import getPage
import metadata_parser
import pandas as pd
import numpy as np
from multiprocessing import Process
def pageCallback(result, url):
data = {
'content': result,
'url': url,
}
return data
def getPageData(url):
d = getPage(url)
d.addCallback(pageCallback, url)
return d
def listCallback(result):
for isSuccess, data in result:
if isSuccess:
print("Call to %s succeeded " % (data['url']))
parser = metadata_parser.MetadataParser(html=data['content'], search_head_only=False)
print(parser.metadata) # do something with it here
def finish(ign):
reactor.stop()
def start(urls):
data = []
for url in urls:
data.append(getPageData(url))
dl = defer.DeferredList(data)
dl.addCallback(listCallback)
dl.addCallback(finish)
def processStart(chunk):
start(chunk)
reactor.run()
df = pd.read_csv('final_urls')
urls = df['site'].values.tolist()
chunkCounter = 0
chunkLength = 1000
for chunk in np.array_split(urls,len(urls)/chunkLength):
p = Process(target=processStart, args=(chunk,))
p.start()
p.join()
chunkCounter += 1
print("Finished chunk %s of %s URLs" % (str(chunkCounter), str(chunkLength)))
I have run it on 10,000 URLs and it took less than 16 minutes.
Updated
Normally you would process the data you generated where I added the comment "# do something with it here". In the event you want the generated data returned back for processing you can do something like this (I have also updated to use treq.):
from twisted.internet import defer, reactor
import treq
import metadata_parser
import pandas as pd
import numpy as np
import multiprocessing
from twisted.python import log
import sys
# log.startLogging(sys.stdout)
results = []
def pageCallback(result, url):
content = result.content()
data = {
'content': content,
'url': url,
}
return data
def getPageData(url):
d = treq.get(url, timeout=60, headers={'User-Agent': ["Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv'\:'57.0) Gecko/20100101 Firefox/57.0"]})
d.addCallback(pageCallback, url)
return d
def listCallback(result):
global results
for isSuccess, data in result:
if isSuccess:
print("Call to %s succeeded " % (data['url']))
parser = metadata_parser.MetadataParser(html=str(data['content']), search_head_only=False)
# print(parser.metadata) # do something with it here
results.append((data['url'], parser.metadata))
def finish(ign):
reactor.stop()
def start(urls):
data = []
for url in urls:
data.append(getPageData(url))
dl = defer.DeferredList(data)
dl.addCallback(listCallback)
dl.addCallback(finish)
def processStart(chunk, returnList):
start(chunk)
reactor.run()
returnList.extend(results)
df = pd.read_csv('final_urls')
urls = df['site'].values.tolist()
chunkCounter = 0
chunkLength = 1000
manager = multiprocessing.Manager()
returnList = manager.list()
for chunk in np.array_split(urls,len(urls)/chunkLength):
p = multiprocessing.Process(target=processStart, args=(chunk,returnList))
p.start()
p.join()
chunkCounter += 1
print("Finished chunk %s of %s URLs" % (str(chunkCounter), str(chunkLength)))
for res in returnList:
print (res)
print (len(returnList))
You may also want to add some error handling, to help you can uncomment the line reading "log.startLogging(sys.stdout)" but this is too much detail for one answer. If you get some failures for URLs I would generally retry them by running the code again with just the failed URLs possibly a few times if necessary.