Extract data from array - Python - python

I am using the unleashed_py library to extract Unleashed data.
The sample of the output is as below where there could be several items in the invoice:
[{
'OrderNumber': 'SO-00000742',
'QuoteNumber': None,
'InvoiceDate': '/Date(1658496322067)/',
'InvoiceLines': [{'LineNumber': 1,
'LineType': None},
{'LineNumber': 2,
'LineType': None}],
'Guid': '8f6b89da-1e6e-42288a24-902a-038041e04f06',
'LastModifiedOn': '/Date(1658496322221)/'}]
I need to get a df:
If I run the below script, the invoice lines just get appended with the common fields such as ordernumber, quotenumber, invoicedate, guide, and lastmodifiedon not getting repeated.
order_number = []
quote_number = []
invoice_date = []
invoicelines = []
invoice_line_number = []
invoice_line_type = []
guid = []
last_modified = []
for item in df:
order_number.append(item.get('OrderNumber'))
quote_number.append(item.get('QuoteNumber'))
invoice_date.append(item.get('InvoiceDate'))
guid.append(item.get('Guid'))
last_modified.append(item.get('LastModifiedOn'))
lines = item.get('InvoiceLines')
for item_sub_2 in lines:
invoice_line_number.append('LineNumber')
invoice_line_type.append('LineType')
df_order_number = pd.DataFrame(order_number)
df_quote_number = pd.DataFrame(quote_number)
df_invoice_date = pd.DataFrame(invoice_date)
df_invoice_line_number = pd.DataFrame(invoice_line_number)
df_invoice_line_type = pd.DataFrame(invoice_line_type)
df_guid = pd.DataFrame(guid)
df_last_modified = pd.DataFrame(last_modified)
df_row = pd.concat([
df_order_number,
df_quote_number,
df_invoice_date,
df_invoice_line_number,
df_invoice_line_type,
df_guid,
df_last_modified
], axis = 1)
What am I doing wrong?

You don't need to iterate, just create the dataframe out of the list of dictionaries you have, then explode InvoiceLines columns then apply pd.Series and join it with the original dataframe:
data = [{
'OrderNumber': 'SO-00000742',
'QuoteNumber': None,
'InvoiceDate': '/Date(1658496322067)/',
'InvoiceLines': [{'LineNumber': 1,
'LineType': None},
{'LineNumber': 2,
'LineType': None}],
'Guid': '8f6b89da-1e6e-42288a24-902a-038041e04f06',
'LastModifiedOn': '/Date(1658496322221)/'}]
df=pd.DataFrame(data).explode('InvoiceLines')
out=pd.concat([df['InvoiceLines'].apply(pd.Series),
df.drop(columns=['InvoiceLines'])],
axis=1)
OUTPUT:
#out
LineNumber LineType OrderNumber QuoteNumber InvoiceDate \
0 1.0 NaN SO-00000742 None /Date(1658496322067)/
0 2.0 NaN SO-00000742 None /Date(1658496322067)/
Guid LastModifiedOn
0 8f6b89da-1e6e-42288a24-902a-038041e04f06 /Date(1658496322221)/
0 8f6b89da-1e6e-42288a24-902a-038041e04f06 /Date(1658496322221)/
I'm leaving the date conversion and column renames for you cause I believe you can do that yourself.

Related

cannot concatenate object of type '<class 'list'>' when convering from df.append to pd.concat

I have a little parser that is gathering RSS feed channel to pandas df. Everything works as expected but I get this waring
The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead
After some research, I converted my dicts to list and then started to concatenate but now I get the
type '<class 'list'>'; only Series and DataFrame objs are valid
how to rewrite my for loop to get expected result
working code with warning
df = pd.DataFrame(columns = ['title', 'link'])
with response as r:
items = r.html.find('item', first=False)
for item in items:
title = item.find('title', first=True).text
link = item.find('guid', first=True).text
row = {'title': title, 'link': link}
df = df.append(row, ignore_index=True)
slightly modified, gives error
df = pd.DataFrame(columns = ['title', 'link'])
tmp = []
with response as r:
items = r.html.find('item', first=False)
for item in items:
title = item.find('title', first=True).text
link = item.find('guid', first=True).text
row = [title, link]
tmp.append(row)
df = pd.concat(tmp)
You can use pd.concat() for dataframes. You just need the create your dataframe with the tmp list. Maybe you can get data with pd.read_html I don't know actually.
tmp = []
with response as r:
items = r.html.find('item', first=False)
for item in items:
title = item.find('title', first=True).text
link = item.find('guid', first=True).text
row = [title, link]
tmp.append(row)
df = pd.DataFrame(tmp, columns=['title', 'link'])
pd.concat works to concatenate two or more pandas objects.
If you have succesfully constructed a list of dicts containing your data (which you have in the tmp variable) then you can transform it into a dataframe just by using the default pd.DataFrame constructor:
df = pd.DataFrame(columns = ['title', 'link'])
tmp = []
with response as r:
items = r.html.find('item', first=False)
for item in items:
title = item.find('title', first=True).text
link = item.find('guid', first=True).text
row = {'title': title, 'link': link}
tmp.append(row)
df = pd.DataFrame(tmp)
You need to change row to dict, e.g.:
row = {'col1': [title], 'col2': [link]}
and the append line to:
tmp = tmp.append(pd.DataFrame(row))
don't forget to reset the tmp to dataframe:
tmp = pd.DataFrame()

Pandas how to search one df for a certain date and return that data

I have two data frames and I am trying to search each row by date in the user.csv file and find the corresponding date in the Raven.csv file and then return the Price from the df1 and the date and amount from df2.
This is working but my Price is returning a value like this [[0.11465]], is there a way to remove these brackets or a better way to do this?
import pandas as pd
df1 = pd.read_csv('Raven.csv',)
df2 = pd.read_csv('User.csv')
df1 = df1.reset_index(drop=False)
df1.columns = ['index', 'Date', 'Price']
df2['Timestamp'] = pd.to_datetime(df2['Timestamp'], format="%Y-%m-%d %H:%M:%S").dt.date
df1['Date'] = pd.to_datetime(df1['Date'], format="%Y-%m-%d").dt.date
Looper = 0
Date = []
Price = []
amount = []
total_value = []
for x in df2['Timestamp']:
search = df2['Timestamp'].values[Looper]
Date.append(search)
price =(df1.loc[df1['Date'] == search,['index']] )
value = df1['Price'].values[price]
Price.append(value)
payout = df2['Amount'].values[Looper]
amount.append(payout)
payout_value = value * payout
total_value.append(payout_value)
Looper = Looper + 1
dict = {'Date': Date, 'Price': Price, 'Payout': amount, "Total Value": total_value}
df = pd.DataFrame(dict)
df.to_csv('out.csv')
You can do indexing to get the value:
value = [[0.11465]][0][0]
print(value)
You get:
0.11465
I hope this is what you need.

Read Excel file without using Pandas and add new columns and print and output file

I am new to python coding and due to some issue, I need to reconfigure my code without pandas.
I am reading an Excel and extracting a few columns with filtered values. Then passing the one column value to a function to fetch the results. The result comes back in a complex dictionary format then I have to create a new column from the dictionary then join the two outputs (initial Excel file and complex dictionary) and print that back in the output file.
So my data is
Customer Customer Name Serial Number
1 XYZ 101011
2 XYZ 1020123
3 XYX 102344
Dictionary output
[{'cert': {'alternate_names': [],
'created_on': '2017-09-10T16:15:25.7599734Z',
'csr_used': False,
'error_details': '',
'revocation_date': None,
'revocation_status': None,
'serial_no': '101011',
'status': 'Expired',
'valid_to': '2020-09-09T23:59:59.0000000Z'},
'meta': {'api_application_biz_unit': '',
'api_client_nuid': '',
'asset_name': '',
'audience': 'External',
'automation_utility': '',
'delegate_owner': '',
'environment': 'Development',
'l2_group_email': None,
'l3_group_email': None,
'requestor_email': '',
'support_email': '',
'tech_delegate_email': None,
'tech_owner_email': None}}]
Desired output:
Customer Customer Name Serial Number Alternate_name Audience Environment
1 XYZ 101011 [] External Dev
My Code:
def create_excel(filename):
data = pd.read_excel(filename, usecols=[4,18,19,20,26,27,28])
data["Customer Name"].fillna("N/A",inplace= True)
df = data[data['Customer Name'].str.contains("XYZ",case = False)]
output = df['Serial Number'].apply(lambda x: fetch_by_ser_no(x))
df2 = pd.DataFrame(output)
df2.columns = ['Output']
df5 = pd.concat([df,df2],axis = 1)
df3 = pd.concat([pd.DataFrame(pd.json_normalize(x)) for x in df2['Output']],
ignore_index=False)
df3["Serial Number"] = df3.iloc[:,11]
df4 = pd.merge(left = df5, right = df3, how = 'left',
left_on = df5["Serial Number"].str.lower(),
right_on = df3["Serial Number"].str.lower())
df4.fillna("N/A",inplace = True)
df4["Status"] = df4.iloc[:,21].replace({"N/A":"Cust Not Found"},inplace = True)
df4["Status"] = df4.iloc[:,21]
df4["Serial Number"] = df4.iloc[:,4]
df4["Audience"] = df4.iloc[:,30]
df4["Environment"] = df4.iloc[:,33]
df4[["Customer","Customer Name","Serial Number","Common Name","Status",
"Environment","Audience"]].to_excel(r'Data.xlsx', index = False)
I want to remove the pandas dependency from the code. I am having a hard time figuring this out.

Python 3.7 KeyError

I like to retrieve information from NewsApi and ran into an issue. Enclosed the code:
from NewsApi import NewsApi
import pandas as pd
import os
import datetime as dt
from datetime import date
def CreateDF(JsonArray,columns):
dfData = pd.DataFrame()
for item in JsonArray:
itemStruct = {}
for cunColumn in columns:
itemStruct[cunColumn] = item[cunColumn]
# dfData = dfData.append(itemStruct,ignore_index=True)
# dfData = dfData.append({'id': item['id'], 'name': item['name'], 'description': item['description']},
# ignore_index=True)
# return dfData
return itemStruct
def main():
# access_token_NewsAPI.txt must contain your personal access token
with open("access_token_NewsAPI.txt", "r") as f:
myKey = f.read()[:-1]
#myKey = 'a847cee6cc254d8495632f83d5c77d39'
api = NewsApi(myKey)
# get sources of news
# columns = ['id', 'name', 'description']
# rst_source = api.GetSources()
# df = CreateDF(rst_source['sources'], columns)
# df.to_csv('source_list.csv')
#
#
# # get news for specific country
# rst_country = api.GetHeadlines()
# columns = ['author', 'publishedAt', 'title', 'description','content', 'url']
# df = CreateDF(rst_country['articles'], columns)
# df.to_csv('Headlines_country.csv')
# get news for specific symbol
symbol = "coronavirus"
sources = 'bbc.co.uk'
columns = ['author', 'publishedAt', 'title', 'description', 'content', 'source']
limit = 500 # maximum requests per day
i = 1
startDate = dt.datetime(2020, 3, 1, 8)
# startDate = dt.datetime(2020, 3, 1)
df = pd.DataFrame({'author': [], 'publishedAt': [], 'title': [], 'description': [], 'content':[], 'source': []})
while i < limit:
endDate = startDate + dt.timedelta(hours=2)
rst_symbol = api.GetEverything(symbol, 'en', startDate, endDate, sources)
rst = CreateDF(rst_symbol['articles'], columns)
df = df.append(rst, ignore_index=True)
# DF.join(df.set_index('publishedAt'), on='publishedAt')
startDate = endDate
i += 1
df.to_csv('Headlines_symbol.csv')
main()
I got following error:
rst = CreateDF(rst_symbol['articles'], columns)
KeyError: 'articles'
In this line:
rst = CreateDF(rst_symbol['articles'], columns)
I think there is some problem regarding the key not being found or defined - does anyone has an idea how to fix that? I'm thankful for every hint!
MAiniak
EDIT:
I found the solution after I tried a few of your hints. Apparently, the error occurred when the NewsAPI API key ran into a request limit. This happened every time, until I changed the limit = 500 to limit = 20. For some reason, there is no error with a new API Key and reduced limit.
Thanks for your help guys!
Probably 'articles' is not one of your columns in rst_symbol object.
The python documentation [2] [3] doesn't mention any method named NewsApi() or GetEverything(), but rather NewsApiClient() and get_everything(), i.e.:
from newsapi import NewsApiClient
# Init
newsapi = NewsApiClient(api_key='xxx')
# /v2/top-headlines
top_headlines = newsapi.get_top_headlines(q='bitcoin',
sources='bbc-news,the-verge',
category='business',
language='en',
country='us')
# /v2/everything
all_articles = newsapi.get_everything(q='bitcoin',
sources='bbc-news,the-verge',
domains='bbc.co.uk,techcrunch.com',
from_param='2017-12-01',
to='2017-12-12',
language='en',
sort_by='relevancy',
page=2)
# /v2/sources
sources = newsapi.get_sources()

How to run a function on each row in DataFrame and append the result to a new DataFrame

NB My code runs if copied
I wrote a simple script to backtest cryptocurrencies using the poloniex API.
First I request the data from the API and turn it into a dataframe data.
Then I take the data I want and make new df called df
A function trade must then be run on each line in df, simple put if the price is above the rolling mean it buys and sells if below, this data is then saved in log.
I am having trouble applying this function on each row in df.
I had great success using the line log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1) BUT surprising it works when BTC_ETH is used in the API call and not for others ie BTC_FCT or BTC_DOGE despite the data being identical in form. Using ETH results in the creation of DataFrame (which is what i want) DOGE and FCT creates a Series
First question, how can I run my trade function on each row and create a new df log with the results
Bonus question, even though the data types are the same why does it work for ETH but not for DOGE/FCT ?
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
API = 'https://poloniex.com/public?command=returnChartData&currencyPair=BTC_FCT&start=1435699200&end=9999999999&period=86400'
data = pd.read_json(API)
df = pd.DataFrame(columns = {'date','close','MA'})
df.MA = pd.rolling_mean(data.close, 30)
df.close = data.close
df.date = data.date
df = df.truncate(before=29)
def print_full(x):
pd.set_option('display.max_rows', len(x))
print(x)
pd.reset_option('display.max_rows')
log = pd.DataFrame(columns = ['Date', 'type', 'profit', 'port_value'])
port = {'coin': 0, 'BTC':1}
def trade(date, close, MA):
if MA < close and port['coin'] == 0 :
coins_bought = port['BTC']/MA
port['BTC'] = 0
port['coin'] = coins_bought
d = {'Date':date, 'type':'buy', 'coin_value': port['coin'], 'btc_value':port['BTC']}
return pd.Series(d)
elif MA > close and port['BTC'] == 0 :
coins_sold = port['coin']*MA
port['coin'] = 0
port['BTC'] = coins_sold
d = {'Date':date, 'type':'sell', 'coin_value': port['coin'], 'btc_value':port['BTC']}
print()
return pd.Series(d)
log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
log = log.dropna()
print_full(log)
EDIT:
I solved the problem, I fixed it by appending the dicts to list and then using the df.from_dict() method to create the log dataframe, my code just to clarify.
def trade(date, close, MA):#, port):
#d = {'Data': close}
#test_log = test_log.append(d, ignore_index=True)
if MA < close and port['coin'] == 0 :
coins_bought = port['BTC']/MA
port['BTC'] = 0
port['coin'] = coins_bought
d = {'Date':date, 'type':'buy', 'coin_value': port['coin'], 'btc_value':port['BTC']}
data_list.append(d)
#return pd.Series(d)
elif MA > close and port['BTC'] == 0 :
coins_sold = port['coin']*MA
port['coin'] = 0
port['BTC'] = coins_sold
d = {'Date':date, 'type':'sell', 'coin_value': port['coin'], 'btc_value':port['BTC']}
data_list.append(d)
#return pd.Series(d)
df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
log = log.dropna()
for key,value in port.items():
print(key, value )
log.from_dict(data_list)
The problem is that you are not always returning a value in trade, which is confusing Pandas. Try this:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
API = 'https://poloniex.com/public?command=returnChartData&currencyPair=BTC_FCT&start=1435699200&end=9999999999&period=86400'
data = pd.read_json(API)
df = pd.DataFrame(columns = {'date','close','MA'})
df.MA = pd.rolling_mean(data.close, 30)
df.close = data.close
df.date = data.date
df = df.truncate(before=29)
def print_full(x):
pd.set_option('display.max_rows', len(x))
print(x)
pd.reset_option('display.max_rows')
log = pd.DataFrame(columns = ['Date', 'type', 'profit', 'port_value'])
port = {'coin': 0, 'BTC':1}
port = {'coin': 0, 'BTC':1}
def trade(date, close, MA):
d = {'Date': date, 'type':'', 'coin_value': np.nan, 'btc_value': np.nan}
if MA < close and port['coin'] == 0 :
coins_bought = port['BTC']/MA
port['BTC'] = 0
port['coin'] = coins_bought
d['type'] = 'buy'
d['coin_value'] = port['coin']
d['btc_value'] = port['BTC']
elif MA > close and port['BTC'] == 0 :
coins_sold = port['coin']*MA
port['coin'] = 0
port['BTC'] = coins_sold
d['type'] = 'sell'
d['coin_value'] = port['coin']
d['btc_value'] = port['BTC']
return pd.Series(d)
log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
log = log.dropna()
print_full(log)
However, as I mentioned in the comment, passing a function with side-effects to apply is not a good idea according to the documentation, and in fact I think it may not produce the correct result in your case.

Categories