I have a SQL query:
pd.read_sql_query("""SELECT UvTab.A, UvTab.Uv,
IFNULL(DvTab.Dv, 0) AS Dv
FROM
(
SELECT A, COUNT(*) AS Uv FROM B
WHERE Vtype = 2 GROUP BY A
) AS UvTab
LEFT JOIN
(
SELECT A, COUNT(*) AS Dv FROM B
WHERE Vtype = 3 GROUP BY A
) AS DvTab
ON UvTab.A = DvTab.A
""", conn)
And my goal is to get the same result but using only pandas' methods. What I obtained is:
UvTab = B.loc[B.Vtype == 2].groupby("A").size()
UvTab = pd.DataFrame({'A' : UvTab.index, 'Uv' : UvTab.values})
DvTab = B.loc[B.Vtype == 3].groupby("A").size()
DvTab = pd.DataFrame({'A' : DvTab.index, 'Dv' : DvTab.values})
df = pd.merge(UvTab, DvTab, how='left', on='A')
df['Dv'] = df['Dv'].fillna(0)
And it seems to be fine. But is this the simplest and best way to express the query?
One idea is to aggregate with sum over the boolean match (summing a boolean mask counts the matches) and then use DataFrame.join:
UvTab = (B.Vtype == 2).astype(int).groupby(B["A"]).sum().reset_index(name='Uv')
DvTab = (B.Vtype == 3).astype(int).groupby(B["A"]).sum().to_frame('Dv')
df = UvTab.join(DvTab, on='A').fillna({'Dv': 0})
Or alternative with merge:
UvTab = (B.Vtype == 2).astype(int).groupby(B["A"]).sum().reset_index(name='Uv')
DvTab = (B.Vtype == 3).astype(int).groupby(B["A"]).sum().reset_index(name='Dv')
df = UvTab.merge(DvTab, on='A', how='left').fillna({'Dv': 0})
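For completeness, a single pass can produce both counts at once by grouping on both columns and unstacking. This is a minimal sketch assuming B has columns A and Vtype and that both vote types occur somewhere in the data; the final filter mirrors the LEFT JOIN, which only keeps groups that have at least one Vtype == 2 row:
counts = (B[B['Vtype'].isin([2, 3])]
          .groupby(['A', 'Vtype']).size()      # count rows per (A, Vtype)
          .unstack(fill_value=0)               # one column per Vtype
          .rename(columns={2: 'Uv', 3: 'Dv'}))
# keep only groups with at least one upvote, like the LEFT JOIN from UvTab
df = counts.loc[counts['Uv'] > 0].reset_index()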
I have two data frames. For each row in User.csv I look up the matching date in Raven.csv, then return the Price from df1 along with the date and amount from df2.
This works, but my Price comes back nested like [[0.11465]]. Is there a way to remove these brackets, or a better way to do this altogether?
import pandas as pd

df1 = pd.read_csv('Raven.csv')
df2 = pd.read_csv('User.csv')

df1 = df1.reset_index(drop=False)
df1.columns = ['index', 'Date', 'Price']
df2['Timestamp'] = pd.to_datetime(df2['Timestamp'], format="%Y-%m-%d %H:%M:%S").dt.date
df1['Date'] = pd.to_datetime(df1['Date'], format="%Y-%m-%d").dt.date

Looper = 0
Date = []
Price = []
amount = []
total_value = []
for x in df2['Timestamp']:
    search = df2['Timestamp'].values[Looper]
    Date.append(search)
    price = df1.loc[df1['Date'] == search, ['index']]
    value = df1['Price'].values[price]
    Price.append(value)
    payout = df2['Amount'].values[Looper]
    amount.append(payout)
    payout_value = value * payout
    total_value.append(payout_value)
    Looper = Looper + 1

data = {'Date': Date, 'Price': Price, 'Payout': amount, "Total Value": total_value}
df = pd.DataFrame(data)
df.to_csv('out.csv')
The double brackets appear because df1.loc[df1['Date'] == search, ['index']] returns a DataFrame, and using it to index df1['Price'].values yields a nested array. You can unwrap the value by indexing twice:
value = [[0.11465]][0][0]
print(value)
You get:
0.11465
I hope this is what you need.
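A vectorized sketch that avoids the loop entirely, under the assumptions from the question (df1 has Date and Price with one price per date, df2 has Timestamp and Amount, both already reduced to dates): a single left merge lines the prices up, and the arithmetic happens column-wise, so no brackets appear in the first place.
# one left merge replaces the whole lookup loop
merged = df2.merge(df1[['Date', 'Price']],
                   left_on='Timestamp', right_on='Date', how='left')
out = pd.DataFrame({'Date': merged['Timestamp'],
                    'Price': merged['Price'],
                    'Payout': merged['Amount'],
                    'Total Value': merged['Price'] * merged['Amount']})
out.to_csv('out.csv', index=False)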
I have this code and I want to do the same for three different dataframes df1, df2, df3.
The column in df2 is 10_11 and in df3 it is 5_6.
df1_30_31 = pd.DataFrame(df1_30_31['30_31'])
df2_10_11 = pd.DataFrame(df2_10_11['10_11'])
df3_5_6 = pd.DataFrame(df3_5_6['5_6'])

df1_30_31['30_31'] = df1_30_31['30_31'].replace(np.nan, ' ', regex=True)
df1_30_31['new_'] = df1_30_31['30_31'].apply(lambda x: '-' if 'nan' in x else x)

def Ok_fun(df):
    if (df['new_'] == 'T-S') or (df['new_'] == 'B-S') or (df['new_'] == 'B-M') or (df['new_'] == 'M-L'):
        val = 'NOK'
    elif (df['new_'] != '-') & (df['new_'] != 'NOK'):
        val = 'OK'
    else:
        val = 'NOO'
    return val

df1_30_31['new_'] = df1_30_31.apply(Ok_fun, axis=1)
df1_30_31['30_31'].replace(' ', np.nan, inplace=True)
df1_30_31.dropna(inplace=True)

Nbr_not_ok_30_31 = df1_30_31[df1_30_31["new_"] == 'NOK']['new_'].count()
Nbr_ok_30_31 = df1_30_31[df1_30_31["new_"] == 'OK']['new_'].count()
total_30_31 = Nbr_not_ok_30_31 + Nbr_ok_30_31
ratio_30_31 = (Nbr_not_ok_30_31 / total_30_31) * 100
Use a function that takes the dataframe and the column name as input and returns the processed dataframe:
def change_df(df, col):
    # ...
    df[col] = df[col].replace(np.nan, ' ', regex=True)
    # ...
    return df

df1 = change_df(df=df1, col='30_31')
df2 = change_df(df=df2, col='10_11')
df3 = change_df(df=df3, col='5_6')
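As a fuller sketch of that idea, here is one way the helper might look with the question's classification logic folded in. Treat it as an assumption-laden rewrite rather than a drop-in: it swaps the row-wise Ok_fun for numpy.select and returns the NOK ratio alongside the processed frame:
import numpy as np
import pandas as pd

def change_df(df, col):
    out = pd.DataFrame(df[col])
    out[col] = out[col].replace(np.nan, ' ', regex=True)
    out['new_'] = out[col].apply(lambda x: '-' if 'nan' in x else x)
    # classify: the four size pairs are NOK, '-' is NOO, everything else OK
    nok = out['new_'].isin(['T-S', 'B-S', 'B-M', 'M-L'])
    out['new_'] = np.select([nok, out['new_'] != '-'], ['NOK', 'OK'], default='NOO')
    out[col] = out[col].replace(' ', np.nan)
    out = out.dropna(subset=[col])
    nbr_nok = (out['new_'] == 'NOK').sum()
    nbr_ok = (out['new_'] == 'OK').sum()
    ratio = nbr_nok / (nbr_nok + nbr_ok) * 100
    return out, ratio

df1_30_31, ratio_30_31 = change_df(df1, '30_31')
df2_10_11, ratio_10_11 = change_df(df2, '10_11')
df3_5_6, ratio_5_6 = change_df(df3, '5_6')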
This is a snippet of my code. I need to clean up one day's worth of older data. How do I do that for a dictionary of dataframes?
master_train_dict = {}
for id in list_of_id:
    temp_df = df.loc[df["id"] == id].copy(deep=False)
    temp_df.drop('id', axis=1, inplace=True)
    temp_df.reset_index(drop=True, inplace=True)
    alert_list = list(temp_df["title"])
    train_embedding = get_embeddings(alert_list, model)
    temp_df["train_embedding"] = train_embedding
    master_train_dict[parent_id] = temp_df[["title", "train_embedding", "#timestamp"]]
    # master_train_dict[parent_id] = temp_df

global master_dict
master_dict = master_train_dict
print(master_dict)

# clean up function
if len(master_dict) > 0:
    d = datetime.today() - timedelta(hours=1, minutes=0)
    master_dict = master_dict[id]['#timestamp'] > d.strftime("%Y-%m-%d %H:%M:%S")
    print(master_dict)
Consider working in defined methods, and use groupby to build the dict of subsetted data frames. Then call the functions via dictionary comprehensions.
def build_df(sub_df):
    sub_df.drop('id', axis=1, inplace=True)
    sub_df.reset_index(drop=True, inplace=True)
    alert_list = list(sub_df["title"])
    train_embedding = get_embeddings(alert_list, model)
    sub_df["train_embedding"] = train_embedding
    sub_df = sub_df.reindex(["title", "train_embedding", "#timestamp"], axis="columns")
    return sub_df

master_train_dict = {i: build_df(g) for i, g in df.groupby("id")}

def clean_df(df):
    d = datetime.today() - timedelta(hours=1, minutes=0)
    df = df[df['#timestamp'] > d.strftime("%Y-%m-%d %H:%M:%S")]
    return df

clean_master_train_dict = {k: clean_df(v) for k, v in master_train_dict.items()}
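One caveat, flagged as an assumption about the data: the string comparison in clean_df only behaves correctly if '#timestamp' holds strings formatted exactly as %Y-%m-%d %H:%M:%S. If the column is a real datetime64, it is safer to compare against the datetime itself, roughly:
from datetime import datetime, timedelta

def clean_df(df):
    # assumes '#timestamp' is datetime64, so no string formatting is needed
    cutoff = datetime.today() - timedelta(hours=1)
    return df[df['#timestamp'] > cutoff]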
I have a big dataframe with two million rows and 60,000 unique (store_id, product_id) pairs.
I need to select the rows for each (store_id, product_id) pair and do some calculations, such as resampling to H, sum, and average, and finally concat everything into a new dataframe.
The problem is that this is very, very slow, and it gets slower while running.
The main code is:
def process_df(df, func, *args, **kwargs):
    product_ids = df.product_id.unique()
    store_ids = df.store_id.unique()
    # uk = df.drop_duplicates(subset=['store_id', 'product_id'])
    # for idx, item in uk.iterrows():
    all_df = list()
    with tqdm(total=product_ids.shape[0] * store_ids.shape[0]) as t:
        for store_id in store_ids:
            sdf = df.loc[df['store_id'] == store_id]
            for product_id in product_ids:
                new_df = sdf.loc[sdf['product_id'] == product_id]
                if new_df.shape[0] < 14:
                    continue
                new_df = func(new_df, *args, **kwargs)
                new_df.loc[:, 'store_id'] = store_id
                new_df.loc[:, 'product_id'] = product_id
                all_df.append(new_df)
                t.update()
    all_df = pd.concat(all_df)
    return all_df
def process_order_items(df, store_id=None, product_id=None, freq='D'):
    if store_id and "store_id" in df.columns:
        df = df.loc[df['store_id'] == store_id]
    if product_id and "product_id" in df.columns:
        df = df.loc[df['product_id'] == product_id]
    # convert to datetime
    df.loc[:, "datetime_create"] = (pd.to_datetime(df.time_create, unit='ms')
                                    .dt.tz_localize('UTC')
                                    .dt.tz_convert('Asia/Shanghai')
                                    .dt.tz_localize(None))
    df = df[["price", "count", "fee_total", "fee_real", "price_real", "price_guide",
             "price_change_category", "datetime_create"]]
    df.loc[:, "has_discount"] = (df.price_change_category > 0).astype(int)
    df.loc[:, "clearance"] = df.price_change_category.apply(lambda x: x in (10, 20, 23)).astype(int)
    if not freq:
        df.loc[:, "date_create"] = df["datetime_create"]
    else:
        assert freq in ('D', 'H')
        df.index = df.loc[:, "datetime_create"]
        discount_order_count = df['has_discount'].resample(freq).sum()
        clearance_order_count = df['clearance'].resample(freq).sum()
        discount_sale_count = df.loc[df.has_discount > 0, 'count'].resample(freq).sum()
        clearance_sale_count = df.loc[df.clearance > 0, 'count'].resample(freq).sum()
        no_discount_price = df.loc[df.has_discount == 0, 'price'].resample(freq).sum()
        no_clearance_price = df.loc[df.clearance == 0, 'price'].resample(freq).sum()
        order_count = df['count'].resample(freq).count()
        day_count = df['count'].resample(freq).sum()
        price_guide = df['price_guide'].resample(freq).max()
        price_avg = (df['price'] * df['count']).resample(freq).sum() / day_count
        df = pd.DataFrame({
            "price": price_avg,
            "price_guide": price_guide,
            "sale_count": day_count,
            "order_count": order_count,
            "discount_order_count": discount_order_count,
            "clearance_order_count": clearance_order_count,
            "discount_sale_count": discount_sale_count,
            "clearance_sale_count": clearance_sale_count,
        })
        df = df.drop(df[df.order_count == 0].index)
    return df
I think the problem is that there are too many redundant selections.
Maybe I could use groupby(['store_id', 'product_id']).agg to avoid the redundancy, but I have no idea how to use process_order_items with it and merge the results together.
I think you can change:
df.loc[:,"clearance"] = df.price_change_category.apply(lambda x:x in(10, 20, 23)).astype(int)
to Series.isin:
df["clearance"] = df.price_change_category.isin([10, 20, 23]).astype(int)
There is also a solution with Resampler.aggregate for aggregating several columns in one pass:
d = {'has_discount':'sum',
'clearance':'sum',
'count': ['count', 'sum'],
'price_guide':'max'}
df1 = df.resample(freq).agg(d)
df1.columns = df1.columns.map('_'.join)
d1 = {'has_discount_sum': 'discount_order_count',
      'clearance_sum': 'clearance_order_count',
      'count_count': 'order_count',
      'count_sum': 'day_count',
      'price_guide_max': 'price_guide'}
df1 = df1.rename(columns=d1)
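A tiny self-contained demo of that aggregation and flattening, on made-up data, just to show the resulting column names:
import pandas as pd

idx = pd.date_range('2019-01-01', periods=6, freq='H')
df = pd.DataFrame({'has_discount': [1, 0, 1, 1, 0, 0],
                   'clearance':    [0, 0, 1, 0, 1, 0],
                   'count':        [2, 3, 1, 4, 2, 5],
                   'price_guide':  [9.9, 9.9, 8.5, 9.9, 8.5, 9.9]}, index=idx)

d = {'has_discount': 'sum', 'clearance': 'sum',
     'count': ['count', 'sum'], 'price_guide': 'max'}
df1 = df.resample('D').agg(d)
df1.columns = df1.columns.map('_'.join)
# df1.columns: has_discount_sum, clearance_sum, count_count,
#              count_sum, price_guide_max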
Another idea is to not convert the boolean masks to integers, but to use the boolean columns directly for filtering, like:
df["has_discount"] = df.price_change_category > 0
df["clearance"] = df.price_change_category.isin([10, 20, 23])
discount_sale_count = df.loc[df.has_discount, 'count'].resample(freq).sum()
clearance_sale_count = df.loc[df.clearance, 'count'].resample(freq).sum()
# for filtering == 0, invert the boolean mask columns with ~
no_discount_price = df.loc[~df.has_discount, 'price'].resample(freq).sum()
no_clearance_price = df.loc[~df.clearance, 'price'].resample(freq).sum()
The first function can be simplified with GroupBy.apply instead of the loops; the concat is then not necessary:
def f(x):
    print(x)

df = df.groupby(['product_id', 'store_id']).apply(f)
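To make that concrete, a sketch of the end-to-end wiring, under two assumptions: each group handed to apply is already a single (store_id, product_id) slice, so the filter arguments of process_order_items are not needed, and groups for which the function returns None are dropped by apply, which replaces the shape[0] < 14 continue:
def per_group(g):
    if len(g) < 14:           # same threshold as the original loop
        return None           # apply drops groups that return None
    return process_order_items(g, freq='H')

result = df.groupby(['product_id', 'store_id']).apply(per_group)
# result carries product_id/store_id in its index, so the manual
# column assignment and the final concat are no longer needed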
EDIT 2, 9/1 See my answer below!
Pretty new at Python and Pandas here. I've got a script here that uses a for loop to query my database using each line in my list. That all works great, but I can't figure out how to build a data frame from the results of that loop. Any and all pointers are welcome!
#Remove stuff
print "Cleaning list"

def multiple_replacer(key_values):
    replace_dict = dict(key_values)
    replacement_function = lambda match: replace_dict[match.group(0)]
    pattern = re.compile("|".join([re.escape(k) for k, v in key_values]), re.M)
    return lambda string: pattern.sub(replacement_function, string)

multi_line = multiple_replacer(key_values)

print "Querying Database..."
for line in source:
    brand_url = multi_line(line)
    #Run Query with cleaned list
    mysql_query = ("select ub.url as 'URL', b.name as 'Name', b.id as 'ID' from api.brand b join api.url_brand ub on b.id=ub.brand_id where ub.url like '%%%s%%' and b.deleted=0 group by 3;" % brand_url)
    list1 = []
    brands = my_query('prod', mysql_query)

print "Writing CSV..."
#Create DF and CSV
for row in brands:
    list1.append({"URL": row['URL'], "Name": ['Name'], "ID": ['ID']})
    if brands.shape == (3,0):
        df1 = pd.DataFrame(data=brands, columns=['URL', 'Name', 'ID'])
        output = df1.to_csv('ongoing.csv', index=False)
EDIT 8/30
Here is my edit, attempting to use zyxue's method:
#Remove stuff
print "Cleaning list"

def multiple_replacer(key_values):
    replace_dict = dict(key_values)
    replacement_function = lambda match: replace_dict[match.group(0)]
    pattern = re.compile("|".join([re.escape(k) for k, v in key_values]), re.M)
    return lambda string: pattern.sub(replacement_function, string)

multi_line = multiple_replacer(key_values)

print "Querying Database..."
for line in source:
    brand_url = multi_line(line)
    #Run Query with cleaned list
    mysql_query = ("select ub.url as 'URL', b.name as 'Name', b.id as 'ID' from api.brand b join api.url_brand ub on b.id=ub.brand_id where ub.url like '%%%s%%' and b.deleted=0 group by 3;" % brand_url)
    brands = my_query('prod', mysql_query)

print "Writing CSV..."
#Create DF and CSV
records = []
for row in brands:
    records.append({"URL": row['URL'], "Name": ['Name'], "ID": ['ID']})
    if brands.shape == (3,0):
        records.append(dict(zip(brands, ['URL', 'Name', 'ID'])))

df1 = pd.DataFrame.from_records(records)
output = df1.to_csv('ongoing.csv', index=False)
but this only returns a blank CSV. I'm sure I'm applying it wrong.
records = []
for row in brands:
    # if brands.shape == (3,0):
    #     records.append(dict(zip(brands, ['URL', 'Name', 'ID'])))
    # update bug fix:
    if row.shape == (3,0):
        records.append(dict(zip(row, ['URL', 'Name', 'ID'])))

df1 = pd.DataFrame.from_records(records)
output = df1.to_csv('ongoing.csv', index=False)

# ref:
# >>> pd.DataFrame.from_records([{'a': 1, 'b': 2}, {'a': 11, 'b': 22}])
#     a   b
# 0   1   2
# 1  11  22
Okay, I figured it out, and I thought I should post the working script. @zyxue was pretty much right.
source = open('urls.txt')
key_values = ("http://", ""), ("https://", ""), ("www.", ""), ("\n", "")

#Remove stuff
print "Cleaning list"

def multiple_replacer(key_values):
    replace_dict = dict(key_values)
    replacement_function = lambda match: replace_dict[match.group(0)]
    pattern = re.compile("|".join([re.escape(k) for k, v in key_values]), re.M)
    return lambda string: pattern.sub(replacement_function, string)

multi_line = multiple_replacer(key_values)

print "Querying Database..."
records = []
for line in source:
    brand_url = multi_line(line)
    #Run Query with cleaned list
    mysql_query = ("select ub.url as 'URL', b.name as 'Name', b.id as 'ID' from api.brand b join api.url_brand ub on b.id=ub.brand_id where ub.url like '%%%s%%' and b.deleted=0 group by 3;" % brand_url)
    brands = my_query('prod', mysql_query)
    #Append results to the records list
    for row in brands:
        records.append({"URL": row['URL'], "Name": row['Name'], "ID": row['ID']})

#Create DataFrame
df = pd.DataFrame.from_records(records)
#Create CSV
output = df.to_csv('ongoing.csv', index=False)
Essentially, I needed to nest the second for loop under the first and create the 'records' list before the looping began. That way a record is appended for every row returned for every line in 'source'. Seems like a pretty simple concept now!
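A small variation on the same idea, sketched under the assumption that my_query returns dict-like rows as above: collect one small DataFrame per query and concat once at the end, skipping the per-row dict building.
frames = []
for line in source:
    brand_url = multi_line(line)
    mysql_query = ("select ub.url as 'URL', b.name as 'Name', b.id as 'ID' from api.brand b join api.url_brand ub on b.id=ub.brand_id where ub.url like '%%%s%%' and b.deleted=0 group by 3;" % brand_url)
    brands = my_query('prod', mysql_query)
    # from_records picks the listed keys out of each dict-like row
    frames.append(pd.DataFrame.from_records(brands, columns=['URL', 'Name', 'ID']))

df = pd.concat(frames, ignore_index=True)
df.to_csv('ongoing.csv', index=False)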