Add new column to DataFrame with same default value

Add new column to DataFrame with same default value - python

I would like to add a name column based on the 'lNames' list. But my code is overwriting the whole column in the last iteration as follows:
import pandas as pd
def consulta_bc(codigo_bcb):
url = 'http://api.bcb.gov.br/dados/serie/bcdata.sgs.{}/dados?formato=json'.format(codigo_bcb)
df = pd.read_json(url)
df['data'] = pd.to_datetime(df['data'], dayfirst=True)
df.set_index('data', inplace=True)
return df
lCodigos = [12, 11, 1, 21619, 21623, 12466]
lNames = ['CDI', 'SELIC', 'USD', 'EUR', 'GPB', 'IMAB']
iter_len = len(lCodigos)
saida = pd.DataFrame()
for i in range(iter_len):
saida = saida.append(consulta_bc(lCodigos[i]))
saida['nome']= lNames[i]
saida.to_csv('Indice', sep=';', index=True)
saida
Any help will be fully appreciated

Change the for loop in this way:
for i in range(iter_len):
df = consulta_bc(lCodigos[i])
df['nome'] = lNames[i]
saida = saida.append(df)

Related

Pandas how to search one df for a certain date and return that data

I have two data frames and I am trying to search each row by date in the user.csv file and find the corresponding date in the Raven.csv file and then return the Price from the df1 and the date and amount from df2.
This is working but my Price is returning a value like this [[0.11465]], is there a way to remove these brackets or a better way to do this?
import pandas as pd
df1 = pd.read_csv('Raven.csv',)
df2 = pd.read_csv('User.csv')
df1 = df1.reset_index(drop=False)
df1.columns = ['index', 'Date', 'Price']
df2['Timestamp'] = pd.to_datetime(df2['Timestamp'], format="%Y-%m-%d %H:%M:%S").dt.date
df1['Date'] = pd.to_datetime(df1['Date'], format="%Y-%m-%d").dt.date
Looper = 0
Date = []
Price = []
amount = []
total_value = []
for x in df2['Timestamp']:
search = df2['Timestamp'].values[Looper]
Date.append(search)
price =(df1.loc[df1['Date'] == search,['index']] )
value = df1['Price'].values[price]
Price.append(value)
payout = df2['Amount'].values[Looper]
amount.append(payout)
payout_value = value * payout
total_value.append(payout_value)
Looper = Looper + 1
dict = {'Date': Date, 'Price': Price, 'Payout': amount, "Total Value": total_value}
df = pd.DataFrame(dict)
df.to_csv('out.csv')

You can do indexing to get the value:
value = [[0.11465]][0][0]
print(value)
You get:
0.11465
I hope this is what you need.

Webscraping data from a json source, why i get only 1 row?

I'am trying to get some information from a website with python, from a webshop.
I tried this one:
def proba():
my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
data = my_url.json()
results = []
products = data['MainContent'][0]['contents'][0]['productList']['products']
for product in products:
name = product['productModel']['displayName']
try:
priceGross = product['priceInfo']['priceItemSale']['gross']
except:
priceGross = product['priceInfo']['priceItemToBase']['gross']
url = product['productModel']['url']
results.append([name, priceGross, url])
df = pd.DataFrame(results, columns = ['Name', 'Price', 'Url'])
# print(df) ## print df
df.to_csv(r'/usr/src/Python-2.7.13/test.csv', sep=',', encoding='utf-8-sig',index = False )
while True:
mytime=datetime.now().strftime("%H:%M:%S")
while mytime < "23:59:59":
print mytime
proba()
mytime=datetime.now().strftime("%H:%M:%S")
In this webshop there are 9 items, but i see only 1 row in the csv file.

Not entirely sure what you intend as end result. Are you wanting to update an existing file? Get data and write out all in one go? Example of latter shown below where I add each new dataframe to an overall dataframe and use a Return statement for the function call to provide each new dataframe.
import requests
from datetime import datetime
import pandas as pd
def proba():
my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
data = my_url.json()
results = []
products = data['MainContent'][0]['contents'][0]['productList']['products']
for product in products:
name = product['productModel']['displayName']
try:
priceGross = product['priceInfo']['priceItemSale']['gross']
except:
priceGross = product['priceInfo']['priceItemToBase']['gross']
url = product['productModel']['url']
results.append([name, priceGross, url])
df = pd.DataFrame(results, columns = ['Name', 'Price', 'Url'])
return df
headers = ['Name', 'Price', 'Url']
df = pd.DataFrame(columns = headers)
while True:
mytime = datetime.now().strftime("%H:%M:%S")
while mytime < "23:59:59":
print(mytime)
dfCurrent = proba()
mytime=datetime.now().strftime("%H:%M:%S")
df = pd.concat([df, dfCurrent])
df.to_csv(r"C:\Users\User\Desktop\test.csv", encoding='utf-8')

How to improve my pandas efficiency when there is many selections

I have a big dataframe which has two million rows. There are 60000 unique (store_id, product_id) pairs.
I need select by each (store_id, product_id), do some calculation , such as resample to H , sum , avg . Finally, concat all to a new dataframe.
The problem is it is very very slow, and become slower while running.
The mainly code is:
def process_df(df, func, *args, **kwargs):
'''
'''
product_ids = df.product_id.unique()
store_ids = df.store_id.unique()
# uk = df.drop_duplicates(subset=['store_id','product_id'])
# for idx, item in uk.iterrows():
all_df = list()
i = 1
with tqdm(total=product_ids.shape[0]*store_ids.shape[0]) as t:
for store_id in store_ids:
sdf = df.loc[df['store_id']==store_id]
for product_id in product_ids:
new_df = sdf.loc[(sdf['product_id']==product_id) ]
if new_df.shape[0] < 14:
continue
new_df = func(new_df, *args, **kwargs)
new_df.loc[:, 'store_id'] = store_id
new_df.loc[:, 'product_id'] = product_id
all_df.append(new_df)
t.update()
all_df= pd.concat(all_df)
return all_df
def process_order_items(df, store_id=None, product_id=None, freq='D'):
if store_id and "store_id" in df.columns:
df = df.loc[df['store_id']==store_id]
if product_id and "product_id" in df.columns:
df = df.loc[df['product_id']==product_id]
# convert to datetime
df.loc[:, "datetime_create"] = pd.to_datetime(df.time_create, unit='ms').dt.tz_localize('UTC').dt.tz_convert('Asia/Shanghai').dt.tz_localize(None)
df = df[["price", "count", "fee_total", "fee_real", "price_real", "price_guide", "price_change_category", "datetime_create"]]
df.loc[:, "has_discount"] = (df.price_change_category > 0).astype(int)
df.loc[:, "clearance"] = df.price_change_category.apply(lambda x:x in(10, 20, 23)).astype(int)
if not freq:
df.loc[:, "date_create"] = df["datetime_create"]
else:
assert freq in ('D', 'H')
df.index = df.loc[:, "datetime_create"]
discount_order_count = df['has_discount'].resample(freq).sum()
clearance_order_count = df['clearance'].resample(freq).sum()
discount_sale_count = df.loc[df.has_discount >0, 'count'].resample(freq).sum()
clearance_sale_count = df.loc[df.clearance >0, 'count'].resample(freq).sum()
no_discount_price = df.loc[df.has_discount == 0, 'price'].resample(freq).sum()
no_clearance_price = df.loc[df.clearance == 0, 'price'].resample(freq).sum()
order_count = df['count'].resample(freq).count()
day_count = df['count'].resample(freq).sum()
price_guide = df['price_guide'].resample(freq).max()
price_avg = (df['price'] * df['count']).resample(freq).sum() / day_count
df = pd.DataFrame({
"price":price_avg,
"price_guide": price_guide,
"sale_count": day_count,
"order_count": order_count,
"discount_order_count": discount_order_count,
"clearance_order_count": clearance_order_count,
"discount_sale_count": discount_sale_count,
"clearance_sale_count": clearance_sale_count,
})
df = df.drop(df[df.order_count == 0].index)
return df
I think the problem is there are too many redundant selections.
Maybe I could use groupby(['store_id','product_id']).agg to avoid redundant , but I have no idea how to use process_order_items with it and merge results together.

I think you can change:
df.loc[:,"clearance"] = df.price_change_category.apply(lambda x:x in(10, 20, 23)).astype(int)
to Series.isin:
df["clearance"] = df.price_change_category.isin([10, 20, 23]).astype(int)
Also solution for Resampler.aggregate:
d = {'has_discount':'sum',
'clearance':'sum',
'count': ['count', 'sum'],
'price_guide':'max'}
df1 = df.resample(freq).agg(d)
df1.columns = df1.columns.map('_'.join)
d1 = {'has_discount_count':'discount_order_count',
'clearance_count':'clearance_order_count',
'count_count':'order_count',
'count_sum':'day_count',
'price_guide_max':'price_guide'}
df1.rename(columns=d1)
Another idea is no convert boolean mask to integer, but use columns for filtering like:
df["has_discount"] = df.price_change_category > 0
df["clearance"] = df.price_change_category.isin([10, 20, 23])
discount_sale_count = df.loc[df.has_discount, 'count'].resample(freq).sum()
clearance_sale_count = df.loc[df.clearance, 'count'].resample(freq).sum()
#for filtering ==0 invert boolean mask columns by ~
no_discount_price = df.loc[~df.has_discount, 'price'].resample(freq).sum()
no_clearance_price = df.loc[~df.clearance, 'price'].resample(freq).sum()
First function should be simplify by GroupBy.apply instaed loops, then concat is not necessary:
def f(x):
print (x)
df = df.groupby(['product_id','store_id']).apply(f)

How to run a function on each row in DataFrame and append the result to a new DataFrame

NB My code runs if copied
I wrote a simple script to backtest cryptocurrencies using the poloniex API.
First I request the data from the API and turn it into a dataframe data.
Then I take the data I want and make new df called df
A function trade must then be run on each line in df, simple put if the price is above the rolling mean it buys and sells if below, this data is then saved in log.
I am having trouble applying this function on each row in df.
I had great success using the line log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1) BUT surprising it works when BTC_ETH is used in the API call and not for others ie BTC_FCT or BTC_DOGE despite the data being identical in form. Using ETH results in the creation of DataFrame (which is what i want) DOGE and FCT creates a Series
First question, how can I run my trade function on each row and create a new df log with the results
Bonus question, even though the data types are the same why does it work for ETH but not for DOGE/FCT ?
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
API = 'https://poloniex.com/public?command=returnChartData&currencyPair=BTC_FCT&start=1435699200&end=9999999999&period=86400'
data = pd.read_json(API)
df = pd.DataFrame(columns = {'date','close','MA'})
df.MA = pd.rolling_mean(data.close, 30)
df.close = data.close
df.date = data.date
df = df.truncate(before=29)
def print_full(x):
pd.set_option('display.max_rows', len(x))
print(x)
pd.reset_option('display.max_rows')
log = pd.DataFrame(columns = ['Date', 'type', 'profit', 'port_value'])
port = {'coin': 0, 'BTC':1}
def trade(date, close, MA):
if MA < close and port['coin'] == 0 :
coins_bought = port['BTC']/MA
port['BTC'] = 0
port['coin'] = coins_bought
d = {'Date':date, 'type':'buy', 'coin_value': port['coin'], 'btc_value':port['BTC']}
return pd.Series(d)
elif MA > close and port['BTC'] == 0 :
coins_sold = port['coin']*MA
port['coin'] = 0
port['BTC'] = coins_sold
d = {'Date':date, 'type':'sell', 'coin_value': port['coin'], 'btc_value':port['BTC']}
print()
return pd.Series(d)
log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
log = log.dropna()
print_full(log)
EDIT:
I solved the problem, I fixed it by appending the dicts to list and then using the df.from_dict() method to create the log dataframe, my code just to clarify.
def trade(date, close, MA):#, port):
#d = {'Data': close}
#test_log = test_log.append(d, ignore_index=True)
if MA < close and port['coin'] == 0 :
coins_bought = port['BTC']/MA
port['BTC'] = 0
port['coin'] = coins_bought
d = {'Date':date, 'type':'buy', 'coin_value': port['coin'], 'btc_value':port['BTC']}
data_list.append(d)
#return pd.Series(d)
elif MA > close and port['BTC'] == 0 :
coins_sold = port['coin']*MA
port['coin'] = 0
port['BTC'] = coins_sold
d = {'Date':date, 'type':'sell', 'coin_value': port['coin'], 'btc_value':port['BTC']}
data_list.append(d)
#return pd.Series(d)
df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
log = log.dropna()
for key,value in port.items():
print(key, value )
log.from_dict(data_list)

The problem is that you are not always returning a value in trade, which is confusing Pandas. Try this:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
API = 'https://poloniex.com/public?command=returnChartData&currencyPair=BTC_FCT&start=1435699200&end=9999999999&period=86400'
data = pd.read_json(API)
df = pd.DataFrame(columns = {'date','close','MA'})
df.MA = pd.rolling_mean(data.close, 30)
df.close = data.close
df.date = data.date
df = df.truncate(before=29)
def print_full(x):
pd.set_option('display.max_rows', len(x))
print(x)
pd.reset_option('display.max_rows')
log = pd.DataFrame(columns = ['Date', 'type', 'profit', 'port_value'])
port = {'coin': 0, 'BTC':1}
port = {'coin': 0, 'BTC':1}
def trade(date, close, MA):
d = {'Date': date, 'type':'', 'coin_value': np.nan, 'btc_value': np.nan}
if MA < close and port['coin'] == 0 :
coins_bought = port['BTC']/MA
port['BTC'] = 0
port['coin'] = coins_bought
d['type'] = 'buy'
d['coin_value'] = port['coin']
d['btc_value'] = port['BTC']
elif MA > close and port['BTC'] == 0 :
coins_sold = port['coin']*MA
port['coin'] = 0
port['BTC'] = coins_sold
d['type'] = 'sell'
d['coin_value'] = port['coin']
d['btc_value'] = port['BTC']
return pd.Series(d)
log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
log = log.dropna()
print_full(log)
However, as I mentioned in the comment, passing a function with side-effects to apply is not a good idea according to the documentation, and in fact I think it may not produce the correct result in your case.

I want to create a time series of monthly means in Pandas

I have a dataframe that consists of hourly data for a whole year. I want to calculate the monthly means and show them in a time series plot. I have one variable which is NO2 values.
#Cleaning data
ck_2000 = pd.read_csv('2000-CamdenKerbside.csv', header=0,skiprows=4,usecols=range(0,3),skipfooter = 1, na_values = 'No data',engine = 'python')
colnames = ['Date', 'Time', 'NO2']
ck_2000.columns = colnames
#Reformat date/time
ck_2000.Time.replace(to_replace = '24:00:00', value = '00:00:00', inplace = True)
dtw = pd.to_datetime(ck_2000.Date + ck_2000.Time,format='%d/%m/%Y%H:%M:%S')
ck_2000.index = dtw
#Index dataframe by date
firstDate = ck_2000.index[0]
lastDate = ck_2000.index[len(ck_2000.Date) - 1]
ck2000 = ck_2000.reindex(index=pd.date_range(start = firstDate, end =lastDate, freq = '1H'), fill_value= None)
#Change data type to float
ck2000['NO2'] = ck2000['NO2'].dropna().astype('int64')
#Interpolation
ck_2000_int = ck_2000.interpolate()
#df's for all months
ck_2000_jan = ck_2000_int['2000-01']
ck_2000_feb = ck_2000_int['2000-02']
ck_2000_mar = ck_2000_int['2000-03']
ck_2000_apr = ck_2000_int['2000-04']
ck_2000_may = ck_2000_int['2000-05']
ck_2000_jun = ck_2000_int['2000-06']
ck_2000_jul = ck_2000_int['2000-07']
ck_2000_aug = ck_2000_int['2000-08']
ck_2000_sept = ck_2000_int['2000-09']
ck_2000_oct = ck_2000_int['2000-10']
ck_2000_nov = ck_2000_int['2000-11']
ck_2000_dec = ck_2000_int['2000-12']

you should be able to use resample
Consider the following example
tidx = pd.date_range('2000-01-01', '2000-12-31 23:00', freq='H')
ck_2000_int = pd.DataFrame(dict(NO2=np.random.randn(len(tidx))), tidx)
ck_2000_int.resample('M').mean().plot()

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Add new column to DataFrame with same default value - python

Change the for loop in this way: for i in range(iter_len): df = consulta_bc(lCodigos[i]) df['nome'] = lNames[i] saida = saida.append(df)

Related

Pandas how to search one df for a certain date and return that data

Webscraping data from a json source, why i get only 1 row?

How to improve my pandas efficiency when there is many selections

How to run a function on each row in DataFrame and append the result to a new DataFrame

I want to create a time series of monthly means in Pandas

Categories

Resources