How do I save data from websocket? - python

I have successfully subscribed to a websocket and am receiving data. I want to save this data so I can use it in a data frame for further analysis.
My code so far is only returning empty lists and dataframes.
Code (returns an empty list):

wsClient = GDAX.WebsocketClient(url="wss://ws-feed.gdax.com", products="LTC-USD")
df1 = []
for i in wsClient.start():
    df1 = df1.append(wsClient.start())
Code (returns an empty list and an empty dataframe):

wsClient = GDAX.WebsocketClient(url="wss://ws-feed.gdax.com", products="LTC-USD")
dfs = []
for i in wsClient.start():
    dfs.append(wsClient.start())
df1 = pd.concat(dfs)

You need to implement your own custom on_message method to be able to get the websocket information:
import time
import gdax
import pandas as pd

results = []

class myWebsocketClient(gdax.WebsocketClient):
    def on_open(self):
        self.url = "wss://ws-feed.gdax.com/"
        self.products = ["LTC-USD"]

    def on_message(self, msg):
        if 'price' in msg and 'type' in msg:
            results.append(msg['price'])

wsClient = myWebsocketClient()
wsClient.start()
time.sleep(5)
df = pd.DataFrame(results, columns=["Price"])
print(df.head())
wsClient.close()
This will run for 5 seconds, and outputs:
Price
0 153.13000000
1 151.14000000
2 140.52000000
3 140.52000000
4 152.62000000
-- Socket Closed --
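If you want to keep more than the price, the same pattern extends naturally. Here is a sketch (RecordingClient is a hypothetical name; it assumes, as above, that the client delivers each message as a dict) that stores whole messages and flattens them afterwards:

import time
import gdax
import pandas as pd

messages = []

class RecordingClient(gdax.WebsocketClient):  # hypothetical subclass name
    def on_open(self):
        self.url = "wss://ws-feed.gdax.com/"
        self.products = ["LTC-USD"]

    def on_message(self, msg):
        # keep the entire message dict, not just the price
        messages.append(msg)

client = RecordingClient()
client.start()
time.sleep(5)
client.close()

# one row per message; the columns are whatever keys the feed sent
df = pd.json_normalize(messages)
print(df.head())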

Related

Adding Column to data frame based on list content in a loop? - Python

I'm pulling data from the NHL API for player stats based on individual games. I'm trying to make a loop that calls the API, parses the JSON, and creates a dict from which I can then build a data frame for an entire team. The code before my looping looks like this:
API_URL = "https://statsapi.web.nhl.com/api/v1"

response = requests.get(API_URL + "/people/8477956/stats?stats=gameLog", params={"Content-Type": "application/json"})
data = json.loads(response.text)
df_list_dict = []
for game in data['stats'][0]['splits']:
    curr_dict = game['stat']
    curr_dict['date'] = game['date']
    curr_dict['isHome'] = game['isHome']
    curr_dict['isWin'] = game['isWin']
    curr_dict['isOT'] = game['isOT']
    curr_dict['team'] = game['team']['name']
    curr_dict['opponent'] = game['opponent']['name']
    df_list_dict.append(curr_dict)
df = pd.DataFrame.from_dict(df_list_dict)
print(df)
This gives me a digestible data frame for a single player. (/people/{player}/....
I want to iterate through a list (the list being an NHL team), while adding a column that identifies the player and concatenates the created data frames. My attempt thus far looks like this:
import requests
import json
import pandas as pd

Rangers = ['8478550', '8476459', '8479323', '8476389', '8475184', '8480817', '8480078', '8476624', '8481554', '8482109', '8476918', '8476885', '8479324',
           '8482073', '8479328', '8480833', '8478104', '8477846', '8477380', '8477380', '8477433', '8479333', '8479991']

def callapi(player):
    response = requests.get(f'https://statsapi.web.nhl.com/api/v1/people/{player}/stats?stats=gameLog', params={"Content-Type": "application/json"})
    data = json.loads(response.text)
    df_list_dict = []
    for game in data['stats'][0]['splits']:
        curr_dict = game['stat']
        curr_dict['date'] = game['date']
        curr_dict['isHome'] = game['isHome']
        curr_dict['isWin'] = game['isWin']
        curr_dict['isOT'] = game['isOT']
        curr_dict['team'] = game['team']['name']
        curr_dict['opponent'] = game['opponent']['name']
        df_list_dict.append(curr_dict)
    df = pd.DataFrame.from_dict(df_list_dict)
    print(df)

for player in Rangers:
    callapi(player)
print(callapi)
When this is printed I can see all the data frames that were created. I cannot use curr_dict[] to add a column based on the list position (the player ID) because list indices must be integers or slices, not strings.
What I'm hoping to do is make this one data frame in which the stats are identified by a player id column.
My Python knowledge is very scattered. With the progress I've made, I feel as if I should know how to complete this, but I've simply hit a wall. Any help would be appreciated.
You can use concurrent.futures to parallelize the requests before concatenating them all together, and json_normalize to parse the json.
import concurrent.futures
import json
import os

import pandas as pd
import requests

class Scrape:

    def main(self) -> pd.DataFrame:
        rangers = ["8478550", "8476459", "8479323", "8476389", "8475184", "8480817", "8480078",
                   "8476624", "8481554", "8482109", "8476918", "8476885", "8479324", "8482073",
                   "8479328", "8480833", "8478104", "8477846", "8477380", "8477380", "8477433",
                   "8479333", "8479991"]
        with concurrent.futures.ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
            return pd.concat(executor.map(self.get_stats, rangers)).reset_index(drop=True).fillna(0)

    @staticmethod
    def get_stats(player: str) -> pd.DataFrame:
        url = f"https://statsapi.web.nhl.com/api/v1/people/{player}/stats?stats=gameLog"
        with requests.Session() as request:
            response = request.get(url, timeout=30)
        if response.status_code != 200:
            print(response.raise_for_status())
        data = json.loads(response.text)
        df = (pd.json_normalize(data=data, record_path=["stats", "splits"])
                .rename(columns={"team.id": "team_id", "team.name": "team_name",
                                 "opponent.id": "opponent_id", "opponent.name": "opponent_name"})
                .assign(player_id=player))
        df = df[df.columns.drop(list(df.filter(regex="link|gamePk")))]
        df.columns = df.columns.str.split(".").str[-1]
        if "faceOffPct" not in df.columns:
            df["faceOffPct"] = 0
        return df

if __name__ == "__main__":
    stats = Scrape().main()
    print(stats)
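If you would rather keep the sequential loop from the question, a minimal sketch of the fix (assuming the Rangers list and imports from the question) is to tag each row with the player ID inside callapi, return the frame, and concatenate at the end:

def callapi(player):
    response = requests.get(f"https://statsapi.web.nhl.com/api/v1/people/{player}/stats?stats=gameLog")
    data = json.loads(response.text)
    df_list_dict = []
    for game in data["stats"][0]["splits"]:
        curr_dict = game["stat"]
        curr_dict["date"] = game["date"]
        curr_dict["team"] = game["team"]["name"]
        curr_dict["opponent"] = game["opponent"]["name"]
        curr_dict["player_id"] = player  # identifies the player in the combined frame
        df_list_dict.append(curr_dict)
    return pd.DataFrame.from_dict(df_list_dict)

# one frame per player, combined into a single frame
df = pd.concat([callapi(player) for player in Rangers], ignore_index=True)
print(df)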

Python API Call: JSON to Pandas DF

I'm working on pulling data from a public API and converting the JSON response to a Pandas DataFrame. I've written the code to pull the data and gotten a successful JSON response. The issue I'm having is parsing the file and converting the data to a dataframe. Whenever I run my for loop, I get a dataframe that returns 1 row when it should be returning approximately 2,500 rows and 6 columns. I've copied and pasted my code below:
Things to note:
I've commented out my api key with "api_key".
I'm new(ish) to python so I understand that my code formatting might not be best practices. I'm open to changes.
Here is the link to the API that I am requesting from: https://developer.va.gov/explore/facilities/docs/facilities?version=current
facilities_data = pd.DataFrame(columns=['geometry_type', 'geometry_coordinates', 'id', 'facility_name', 'facility_type', 'facility_classification'])

# function that will make the api call and sort through the json data
def get_facilities_data(facilities_data):
    # Make API Call
    res = requests.get('https://sandboxapi.va.gov/services/va_facilities/v0/facilities/all', headers={'apikey': 'api_key'})
    data = json.loads(res.content.decode('utf-8'))
    time.sleep(1)
    for facility in data['features']:
        geometry_type = data['features'][0]['geometry']['type']
        geometry_coordinates = data['features'][0]['geometry']['coordinates']
        facility_id = data['features'][0]['properties']['id']
        facility_name = data['features'][0]['properties']['name']
        facility_type = data['features'][0]['properties']['facility_type']
        facility_classification = data['features'][0]['properties']['classification']
    # Save data into pandas dataframe
    facilities_data = facilities_data.append(
        {'geometry_type': geometry_type, 'geometry_coordinates': geometry_coordinates,
         'facility_id': facility_id, 'facility_name': facility_name, 'facility_type': facility_type,
         'facility_classification': facility_classification}, ignore_index=True)
    return facilities_data

facilities_data = get_facilities_data(facilities_data)
print(facilities_data)
As mentioned, you should:

- loop over facility instead of data['features'][0]
- append within the loop

This will get you the result you are after.
facilities_data = pd.DataFrame(columns=['geometry_type', 'geometry_coordinates', 'id', 'facility_name', 'facility_type', 'facility_classification'])

def get_facilities_data(facilities_data):
    # Make API Call
    res = requests.get("https://sandbox-api.va.gov/services/va_facilities/v0/facilities/all",
                       headers={"apikey": "REDACTED"})
    data = json.loads(res.content.decode('utf-8'))
    time.sleep(1)
    for facility in data['features']:
        geometry_type = facility['geometry']['type']
        geometry_coordinates = facility['geometry']['coordinates']
        facility_id = facility['properties']['id']
        facility_name = facility['properties']['name']
        facility_type = facility['properties']['facility_type']
        facility_classification = facility['properties']['classification']
        # Save data into pandas dataframe
        facilities_data = facilities_data.append(
            {'geometry_type': geometry_type, 'geometry_coordinates': geometry_coordinates,
             'facility_id': facility_id, 'facility_name': facility_name, 'facility_type': facility_type,
             'facility_classification': facility_classification}, ignore_index=True)
    return facilities_data

facilities_data = get_facilities_data(facilities_data)
print(facilities_data.head())
There are some more things we can improve upon:

- json() can be called directly on the requests response
- time.sleep() is not needed
- appending to a DataFrame on each iteration is discouraged; we can collect the data in another way and create the DataFrame afterwards

Implementing these improvements results in:
def get_facilities_data():
    data = requests.get("https://sandbox-api.va.gov/services/va_facilities/v0/facilities/all",
                        headers={"apikey": "REDACTED"}).json()
    facilities_data = []
    for facility in data["features"]:
        facility_data = (facility["geometry"]["type"],
                         facility["geometry"]["coordinates"],
                         facility["properties"]["id"],
                         facility["properties"]["name"],
                         facility["properties"]["facility_type"],
                         facility["properties"]["classification"])
        facilities_data.append(facility_data)
    facilities_df = pd.DataFrame(data=facilities_data,
                                 columns=["geometry_type", "geometry_coords", "id", "name", "type", "classification"])
    return facilities_df
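Since the response is GeoJSON with uniformly shaped features, a still shorter sketch (same endpoint and placeholder key as above) lets pd.json_normalize do the flattening and then selects the columns of interest:

def get_facilities_data():
    data = requests.get("https://sandbox-api.va.gov/services/va_facilities/v0/facilities/all",
                        headers={"apikey": "REDACTED"}).json()
    # json_normalize flattens nested keys into dotted column names,
    # e.g. geometry.type and properties.name
    df = pd.json_normalize(data["features"])
    return df[["geometry.type", "geometry.coordinates", "properties.id",
               "properties.name", "properties.facility_type", "properties.classification"]]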

How can I split a batch string message received from Azure Service Bus into individual rows?

I'm a beginner in Python. I have an Azure Function that runs on a time trigger. This function reads a batch of raw JSON data from an Azure Service Bus in string format.
Below is a sample of two rows of data; in reality, I continuously receive about 50 messages like this. I want to split this message row by row and then archive it to Azure Storage.
The message looks like the sample below (a concatenation of row 1 and row 2):
{"Name":"","Seri":21000000,"SiName":"","As":"","PId":21070101,"ICheck":0,"SeeNum":405097041391424,"Type":0,"Counter":33,"PaId":0,"MeType":30,"RecTime":"2021-10-21T09:04:41.0151Z","ReaTime":null,"Cape":"2021-10-21T09:04:40.644","Status":0,"text":"{\"TYPE_TAG\":\"00\",\"ENSORAG\":{\"date_time\":\"2021-10-21 09:04:40.644\",\"seber\":10,\"seqmber\":405097041391424,\"lo_name\":\"\",\"accati\":{\"0\":0.0,\"1\":-0.037665367,\"2\":-0.033863068,\"3\":-0.026795387,\"4\":-0.03757,\"5\":-0.02809906,\"6\":-0.016090393,\"7\":-0.040496826,\"8\":-0.05318451,\"9\":-0.025012016,\"10\":-0.057872772}},\"ATTACHED_DEVICE_SERIAL_NUMBER_TAG\":\"21000000\",\"error\":{}}","CerId":null,"Id":null,"Asse":null,"Id":0,"id":"075f0a38-2816-42c7-b95c-66c425b8ba9d","t":-1}{"Name":"","Seri":21000000,"SiName":"","As":"","PId":21070101,"ICheck":0,"SeeNum":405097041391424,"Type":0,"Counter":33,"PaId":0,"MeType":30,"RecTime":"2021-10-21T09:04:41.0151Z","ReaTime":null,"Cape":"2021-10-21T09:04:40.644","Status":0,"text":"{\"TYPE_TAG\":\"00\",\"ENSORAG\":{\"date_time\":\"2021-10-21 09:04:40.644\",\"seber\":10,\"seqmber\":405097041391424,\"lo_name\":\"\",\"accati\":{\"0\":0.0,\"1\":-0.037665367,\"2\":-0.033863068,\"3\":-0.026795387,\"4\":-0.03757,\"5\":-0.02809906,\"6\":-0.016090393,\"7\":-0.040496826,\"8\":-0.05318451,\"9\":-0.025012016,\"10\":-0.057872772}},\"NUMBER_TAG\":\"21000000\",\"error\":{}}","CerId":null,"Id":null,"Asse":null,"Id":0,"id":"075f0a38-2816-42c7-b95c-66c425b8ba9d","t":-1}{"Name":"","Seri":4560000,"SiName":"","As":"","PId":2107401,"ICheck":0,"SeeNum":40509704561424,"Type":0,"Counter":34,"PaId":0,"MeType":31,"RecTime":"2021-10-21T09:04:41.0151Z","ReaTime":null,"Cape":"2021-10-21T09:04:40.644","Status":0,"text":"{\"TYPE_TAG\":\"00\",\"ENSORAG\":{\"date_time\":\"2021-10-21 09:04:40.644\",\"seber\":10,\"seqmber\":405097041391424,\"lo_name\":\"\",\"accati\":{\"0\":0.0,\"1\":-0.037665367,\"2\":-0.033863068,\"3\":-0.026795387,\"4\":-0.03757,\"5\":-0.02809906,\"6\":-0.016090393,\"7\":-0.040496826,\"8\":-0.05318451,\"9\":-0.025012016,\"10\":-0.057872772}},\"ATTACHED_DEVICE_SERIAL_NUMBER_TAG\":\"21000000\",\"error\":{}}","CerId":null,"Id":null,"Asse":null,"Id":0,"id":"075f0a38-2816-42c7-b95c-66c425b8ba9d","t":-1}{"Name":"","Seri":21000000,"SiName":"","As":"","PId":21070101,"ICheck":0,"SeeNum":405097041391424,"Type":0,"Counter":33,"PaId":0,"MeType":30,"RecTime":"2021-10-21T09:04:41.0151Z","ReaTime":null,"Cape":"2021-10-21T09:04:40.644","Status":0,"text":"{\"TYPE_TAG\":\"00\",\"ENSORAG\":{\"date_time\":\"2021-10-21 09:04:40.644\",\"seber\":10,\"seqmber\":405097041391424,\"lo_name\":\"\",\"accati\":{\"0\":0.0,\"1\":-0.037665367,\"2\":-0.033863068,\"3\":-0.026795387,\"4\":-0.03757,\"5\":-0.02809906,\"6\":-0.016090393,\"7\":-0.040496826,\"8\":-0.05318451,\"9\":-0.0254566,\"10\":-0.054562772}},\"NUMBER_TAG\":\"2145600\",\"error\":{}}","CerId":null,"Id":null,"Asse":null,"Id":1,"id":"074222a38-2816-42c7-b95c-6644448ba9d","t":-2}
Row 1 is:
{"Name":"","Seri":21000000,"SiName":"","As":"","PId":21070101,"ICheck":0,"SeeNum":405097041391424,"Type":0,"Counter":33,"PaId":0,"MeType":30,"RecTime":"2021-10-21T09:04:41.0151Z","ReaTime":null,"Cape":"2021-10-21T09:04:40.644","Status":0,"text":"{\"TYPE_TAG\":\"00\",\"ENSORAG\":{\"date_time\":\"2021-10-21 09:04:40.644\",\"seber\":10,\"seqmber\":405097041391424,\"lo_name\":\"\",\"accati\":{\"0\":0.0,\"1\":-0.037665367,\"2\":-0.033863068,\"3\":-0.026795387,\"4\":-0.03757,\"5\":-0.02809906,\"6\":-0.016090393,\"7\":-0.040496826,\"8\":-0.05318451,\"9\":-0.025012016,\"10\":-0.057872772}},\"ATTACHED_DEVICE_SERIAL_NUMBER_TAG\":\"21000000\",\"error\":{}}","CerId":null,"Id":null,"Asse":null,"Id":0,"id":"075f0a38-2816-42c7-b95c-66c425b8ba9d","t":-1}
Row 2 is:
{"Name":"","Seri":4560000,"SiName":"","As":"","PId":2107401,"ICheck":0,"SeeNum":40509704561424,"Type":0,"Counter":34,"PaId":0,"MeType":31,"RecTime":"2021-10-21T09:04:41.0151Z","ReaTime":null,"Cape":"2021-10-21T09:04:40.644","Status":0,"text":"{\"TYPE_TAG\":\"00\",\"ENSORAG\":{\"date_time\":\"2021-10-21 09:04:40.644\",\"seber\":10,\"seqmber\":405097041391424,\"lo_name\":\"\",\"accati\":{\"0\":0.0,\"1\":-0.037665367,\"2\":-0.033863068,\"3\":-0.026795387,\"4\":-0.03757,\"5\":-0.02809906,\"6\":-0.016090393,\"7\":-0.040496826,\"8\":-0.05318451,\"9\":-0.025012016,\"10\":-0.057872772}},\"ATTTAG\":\"21000000\",\"error\":{}}","CerId":null,"Id":null,"Asse":null,"Id":0,"id":"075f0a38-2816-42c7-b95c-66c425b8ba9d","t":-2}
The structure of a row is shown in the image below (not reproduced here).
In my opinion, I should first split each row, then create a data frame and insert each value into the related column, and after that append it to a blob. Is that right?
How can I do this? What is your suggested solution?
Edited:
My code for reading from service bus:
from azure.servicebus import ServiceBusClient, ServiceBusMessage

connection_str = "**"
topic_name = "***"
subscription_name = "***"

servicebus_client = ServiceBusClient.from_connection_string(
    conn_str=connection_str, logging_enable=True)

with servicebus_client:
    # get the Subscription Receiver object for the subscription
    receiver = servicebus_client.get_subscription_receiver(
        topic_name=topic_name, subscription_name=subscription_name)
    with receiver:
        for msg in receiver:
            print("Received: " + str(msg))
            # complete the message so that it is removed from the subscription
            receiver.complete_message(msg)
Since the messages are sent individually, you can process them individually. There is no need to concatenate them into a string; just keep appending them to a data frame. The sample below is for a queue, but you can extend it to a topic/subscription. I've also attached the results to show what the output looks like.
from azure.servicebus import ServiceBusClient
import pandas as pd
import json
from pandas import json_normalize

CONNECTION_STR = 'Endpoint=sb://xxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
QUEUE_NAME = 'xxxxxxxxxxx'

servicebus_client = ServiceBusClient.from_connection_string(conn_str=CONNECTION_STR)

with servicebus_client:
    receiver = servicebus_client.get_queue_receiver(queue_name=QUEUE_NAME)
    # create an empty DataFrame object
    df = pd.DataFrame()
    with receiver:
        received_msgs = receiver.receive_messages(max_message_count=10, max_wait_time=5)
        for msg in received_msgs:
            msg_dict = json.loads(str(msg))
            df2 = json_normalize(msg_dict)
            df = df.append(df2, ignore_index=True)
            receiver.complete_message(msg)
    print(df)
print("Receive is done.")
print("Receive is done.")
Name Seri SiName As ... Id Asse id t
0 21000000 ... 0 None 075f0a38-2816-42c7-b95c-66c425b8ba9d -1
1 4560000 ... 0 None 075f0a38-2816-42c7-b95c-66c425b8ba9d -2
[2 rows x 21 columns]
Receive is done.
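One caveat: DataFrame.append was deprecated and later removed in pandas 2.0. A sketch of the same receive loop that collects the per-message frames in a list and concatenates once at the end:

frames = []
with receiver:
    received_msgs = receiver.receive_messages(max_message_count=10, max_wait_time=5)
    for msg in received_msgs:
        # each message body is one JSON object
        frames.append(json_normalize(json.loads(str(msg))))
        receiver.complete_message(msg)

df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(df)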
Consider sample data with three rows:
data = '{"Name": "Hassan", "code":"12"}{"Name": "Jack", "code":"345"}{"Name": "Jack", "code":"345"}'
Here is how you can get a dataframe from this data:

from ast import literal_eval
import pandas as pd

data = [literal_eval(d + '}') for d in data.split('}')[0:-1]]
df = pd.DataFrame.from_records(data)
Output:
Name code
0 Hassan 12
1 Jack 345
2 Jack 345
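Note that splitting on '}' only works because this sample contains no nested braces; the real messages do (the escaped "text" payload). A more robust sketch uses json.JSONDecoder.raw_decode, which parses one object at a time and reports where it ended (message_string stands in for the raw batch received from Service Bus):

import json
import pandas as pd

def split_concatenated_json(raw):
    decoder = json.JSONDecoder()
    pos, rows = 0, []
    while pos < len(raw):
        # skip any whitespace between objects
        while pos < len(raw) and raw[pos].isspace():
            pos += 1
        if pos >= len(raw):
            break
        obj, pos = decoder.raw_decode(raw, pos)  # parse one object, get its end offset
        rows.append(obj)
    return rows

rows = split_concatenated_json(message_string)
df = pd.json_normalize(rows)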

How to iterate over dataframe rows for individual API calls

I'm trying to set up a loop to pull in weather data for about 500 weather stations for an entire year which I have in my dataframe. The base URL stays the same, and the only part that changes is the weather station ID.
I'd like to create a dataframe with the results. I believe I'd use requests.get to pull in data for all the weather stations in my list; the IDs to use in the URL are in a column called "API ID" in my dataframe. I am a Python beginner, so any help would be appreciated! My code is below but doesn't work and returns an error:

InvalidSchema: No connection adapters were found for '0    http://www.ncei.noaa.gov/access/services/data/...\nName: API ID, Length: 497, dtype: object'
def callAPI(API_id):
    for IDs in range(len(API_id)):
        url = ('http://www.ncei.noaa.gov/access/services/data/v1?dataset=daily-summaries&dataTypes=PRCP,SNOW,TMAX,TMIN&stations=' + distances['API ID'] + '&startDate=2020-01-01&endDate=2020-12-31&includeAttributes=0&includeStationName=true&units=standard&format=json')
        r = requests.request('GET', url)
        d = r.json()

ll = []
for index1, rows1 in distances.iterrows():
    station = rows1['Closest Station']
    API_id = rows1['API ID']
    data = callAPI(API_id)
    ll.append([(data)])
I am not sure about your whole code base, but this is the function that will return the data from the API. If you have multiple station IDs in a single df column then you can use a for loop; otherwise there is no need to do that.
Also, you were not returning the result from the function. Note the return keyword at the end of the function.
Working code:
import requests

def callAPI(API_id):
    url = ('http://www.ncei.noaa.gov/access/services/data/v1?dataset=daily-summaries&dataTypes=PRCP,SNOW,TMAX,TMIN&stations=' + API_id + '&startDate=2020-01-01&endDate=2020-12-31&includeAttributes=0&includeStationName=true&units=standard&format=json')
    r = requests.request('GET', url)
    d = r.json()
    return d

print(callAPI('USC00457180'))
So your full code will be something like this:

def callAPI(API_id):
    url = ('http://www.ncei.noaa.gov/access/services/data/v1?dataset=daily-summaries&dataTypes=PRCP,SNOW,TMAX,TMIN&stations=' + API_id + '&startDate=2020-01-01&endDate=2020-12-31&includeAttributes=0&includeStationName=true&units=standard&format=json')
    r = requests.request('GET', url)
    d = r.json()
    return d

ll = []
for index1, rows1 in distances.iterrows():
    station = rows1['Closest Station']
    API_id = rows1['API ID']
    data = callAPI(API_id)
    ll.append(data)
Note: Even better use asynchronous calls to the API to make the process faster. Something like this: https://stackoverflow.com/a/56926297/1138192
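For instance, a thread-pool sketch (reusing the callAPI above and assuming distances['API ID'] holds the station IDs) keeps several requests in flight at once:

from concurrent.futures import ThreadPoolExecutor

ids = distances['API ID'].tolist()
with ThreadPoolExecutor(max_workers=8) as executor:
    # executor.map preserves input order, so results line up with ids
    results = list(executor.map(callAPI, ids))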

Loop and add function component as index

I would like to change the index in the following code. Instead of having 'close' as the index, I want the corresponding x from the function. Sometimes, as in this example, even if I provide 4 currencies only 3 are available, so I cannot add the list as the index after looping because the size changes. Thank you for your help. I should add that even with set_index(x) the index remains 'close'.
The function daily_price_historical retrieves prices from a public API. There are exactly 7 columns, from which I select the first one (close).
The function:
def daily_price_historical(symbol, comparison_symbol, all_data=False, limit=1, aggregate=1, exchange=''):
    url = 'https://min-api.cryptocompare.com/data/histoday?fsym={}&tsym={}&limit={}&aggregate={}'\
        .format(symbol.upper(), comparison_symbol.upper(), limit, aggregate)
    if exchange:
        url += '&e={}'.format(exchange)
    if all_data:
        url += '&allData=true'
    page = requests.get(url)
    data = page.json()['Data']
    df = pd.DataFrame(data)
    df.drop(df.index[-1], inplace=True)
    return df
The code:
curr = ['1WO', 'ABX', 'ADH', 'ALX']
d_price = []
for x in curr:
    try:
        close = daily_price_historical(x, 'JPY', exchange='CCCAGG').close
        d_price.append(close).set_index(x)
    except:
        pass
d_price = pd.concat(d_price, axis=1)
d_price = d_price.transpose()
print(d_price)
The output:
0
close 2.6100
close 0.3360
close 0.4843
The function daily_price_historical returns a dataframe, so daily_price_historical(x, 'JPY', exchange='CCCAGG').close is a pandas Series. The title of a Series is its name, but you can change it with rename. So you want:
...
close = daily_price_historical(x, 'JPY', exchange='CCCAGG').close
d_price.append(close.rename(x))
...

In your original code, d_price.append(close).set_index(x) raised an AttributeError: 'NoneType' object has no attribute 'set_index', because append on a list returns None; the exception was then silently swallowed by the catch-all except: pass.
What to remember from that: never use the very dangerous

try:
    ...
except:
    pass

which hides any error.
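Putting both fixes together, a sketch of the corrected loop (same function and currency list as above; the exception types are a reasonable guess at what can actually fail here) renames each Series and reports anything it skips:

d_price = []
for x in curr:
    try:
        close = daily_price_historical(x, 'JPY', exchange='CCCAGG').close
        d_price.append(close.rename(x))  # label the data with its symbol
    except (KeyError, requests.exceptions.RequestException) as exc:
        # a missing symbol or a failed request is reported instead of hidden
        print(f"skipping {x}: {exc}")

d_price = pd.concat(d_price, axis=1).transpose()
print(d_price)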
Try this small code
import pandas as pd
import requests

curr = ['1WO', 'ABX', 'ADH', 'ALX']

def daily_price_historical(symbol, comparison_symbol, all_data=False, limit=1, aggregate=1, exchange=''):
    url = 'https://min-api.cryptocompare.com/data/histoday?fsym={}&tsym={}&limit={}&aggregate={}'\
        .format(symbol.upper(), comparison_symbol.upper(), limit, aggregate)
    if exchange:
        url += '&e={}'.format(exchange)
    if all_data:
        url += '&allData=true'
    page = requests.get(url)
    data = page.json()['Data']
    df = pd.DataFrame(data)
    df.drop(df.index[-1], inplace=True)
    return df

d_price = []
labels_ind = []
for idx, x in enumerate(curr):
    try:
        close = daily_price_historical(x, 'JPY', exchange='CCCAGG').close
        d_price.append(close[0])
        labels_ind.append(x)
    except:
        pass

d_price = pd.DataFrame(d_price, columns=["0"])
d_price.index = labels_ind
print(d_price)
Output
0
1WO 2.6100
ADH 0.3360
ALX 0.4843
