Additional columns added to saved CSV - python

I have the following code which generates features from a CSV file:
import statistics

import pandas as pd

def gen_features_per_id(file_name, label):
    df = pd.read_csv(file_name, delimiter=',')
    df['dt'] = pd.to_datetime(df['datetime'], unit='s')
    row = []
    column_names = ['group_timestamp', 'label',
                    'x_mean', 'x_median', 'x_stdev', 'x_raw_min', 'x_raw_max', 'x_abs_min', 'x_abs_max',
                    'y_mean', 'y_median', 'y_stdev', 'y_raw_min', 'y_raw_max', 'y_abs_min', 'y_abs_max',
                    'z_mean', 'z_median', 'z_stdev', 'z_raw_min', 'z_raw_max', 'z_abs_min', 'z_abs_max']
    group_df = pd.DataFrame(columns=column_names)
    for group_name, g in df.groupby(pd.Grouper(freq='10s', key='dt')):
        print(f'Start time {group_name} has {len(g)} records within 10 secs')
        group_timestamp = group_name
        label = label
        x = g['x'].head(50)
        x_mean = x.mean()
        x_median = x.median()
        x_std_dev = statistics.stdev(x)
        x_raw_min = min(x)
        x_raw_max = max(x)
        x_abs_min = min(abs(x))
        x_abs_max = max(abs(x))
        # print(
        #     f'Mean : {x_mean}, Median : {x_median}, Stdev : {x_std_dev}, '
        #     f'X raw Min : {x_raw_min}, X raw Max : {x_raw_max}, '
        #     f'X abs Min : {x_abs_min}, X abs Max : {x_abs_max}'
        # )
        y = g['y'].head(50)
        y_mean = y.mean()
        y_median = y.median()
        y_std_dev = statistics.stdev(y)
        y_raw_min = min(y)
        y_raw_max = max(y)
        y_abs_min = min(abs(y))
        y_abs_max = max(abs(y))
        # print(
        #     f'Mean : {y_mean}, Median : {y_median}, Std dev : {y_std_dev}, '
        #     f'Y raw Min : {y_raw_min}, Y raw Max : {y_raw_max}, '
        #     f'Y abs Min : {y_abs_min}, Y abs Max : {y_abs_max}'
        # )
        z = g['z'].head(50)
        z_mean = z.mean()
        z_median = z.median()
        z_std_dev = statistics.stdev(z)
        z_raw_min = min(z)
        z_raw_max = max(z)
        z_abs_min = min(abs(z))
        z_abs_max = max(abs(z))
        # print(
        #     f'Mean : {z_mean}, Median : {z_median}, Std dev : {z_std_dev}, '
        #     f'Z raw Min : {z_raw_min}, Z raw Max : {z_raw_max}, '
        #     f'Z abs Min : {z_abs_min}, Z abs Max : {z_abs_max}'
        # )
        row.append(group_timestamp)
        row.append(label)
        row.append(x_mean)
        row.append(x_median)
        row.append(x_std_dev)
        row.append(x_raw_min)
        row.append(x_raw_max)
        row.append(x_abs_min)
        row.append(x_abs_max)
        row.append(y_mean)
        row.append(y_median)
        row.append(y_std_dev)
        row.append(y_raw_min)
        row.append(y_raw_max)
        row.append(y_abs_min)
        row.append(y_abs_max)
        row.append(z_mean)
        row.append(z_median)
        row.append(z_std_dev)
        row.append(z_raw_min)
        row.append(z_raw_max)
        row.append(z_abs_min)
        row.append(z_abs_max)
        group_df = group_df.append([row], ignore_index=True)
        group_df.to_csv("some.csv", index=False)
        row = []
But the saved CSV file has additional columns added to the start of the header, equal in number to the supplied columns.
Sample CSV
datetime,x,y,z,label
1493740845,0.0004,-0.0001,0.0045,bad
1493740846,0.0003,0.0002,0.0047,bad
1493740847,0.0005,0.0001,0.0049,bad
1493740848,0.0006,0.0004,0.005,bad
1493740849,0.0006,-0.0003,0.005,bad
1493740851,0.0001,-0.0003,0.0039,bad
1493740852,-0.0006,0.0003,0.0046,bad
1493740853,0.0001,0.0,0.0048,bad
Output:
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,group_timestamp,label,x_abs_max,x_abs_min,x_mean,x_median,x_raw_max,x_raw_min,x_stdev,y_abs_max,y_abs_min,y_mean,y_median,y_raw_max,y_raw_min,y_stdev,z_abs_max,z_abs_min,z_mean,z_median,z_raw_max,z_raw_min,z_stdev
# data ... ,,,,,,,,,,,,,,,,,,,,,,,
# data ... ,,,,,,,,,,,,,,,,,,,,,,,
How can I fix this?
Additionally: I would also appreciate help simplifying the code.

The problem is that in each iteration of the groupby loop you need to append the values to a per-iteration row list and then append that row to a rows list defined outside the loop, so that the nested lists can be passed to the DataFrame constructor in one final step:
# added: list of rows for the nested lists (outside the loop)
rows = []

df['dt'] = pd.to_datetime(df['datetime'], unit='s')
for group_name, g in df.groupby(pd.Grouper(freq='10s', key='dt')):
    # added: one row per loop iteration
    row = []
    print(f'Start time {group_name} has {len(g)} records within 10 secs')
    group_timestamp = group_name
    label = label
    x = g['x'].head(50)
    x_mean = x.mean()
    ...  # same calculations for x, y and z as in the question
    row.append(z_abs_max)
    rows.append(row)

# DataFrame created once, outside the loop
group_df = pd.DataFrame(rows, columns=column_names)
print(group_df)
Your solution can be improved further with GroupBy.agg:
# custom aggregate functions
def std_dev(x):
    return statistics.stdev(x)

def abs_min(x):
    return x.abs().min()

def abs_max(x):
    return x.abs().max()

d = ['mean', 'median', std_dev, 'min', 'max', abs_min, abs_max]
cols = ['x', 'y', 'z']

# keep only the first 50 rows of each group
df[cols] = df.groupby(pd.Grouper(freq='10s', key='dt'))[cols].head(50)

# aggregate functions
group_df = df.groupby(pd.Grouper(freq='10s', key='dt'))[cols].agg(d)
group_df.columns = group_df.columns.map('_'.join)
print(group_df)
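If the group_timestamp and label columns from the original column list are still needed, a minimal sketch (assuming label is the single value passed into gen_features_per_id) could restore them before writing the file:

# sketch: bring back the timestamp and label, then write the CSV once
group_df = group_df.reset_index().rename(columns={'dt': 'group_timestamp'})
group_df.insert(1, 'label', label)
group_df.to_csv('some.csv', index=False)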

Related

Rank the rows based on similar text using python?

How can I rank the rows of a data frame based on their values? Each row contains text data, and I want to assign a rank based on the similarity.
Expected output
I have tried with the Levenshtein distance but am not sure how to do it for the whole table:
import string

import pandas as pd
import editdistance as ed   # assumption: `ed` in the original snippet is the editdistance package

def bow(x=None):
    x = x.lower()
    words = x.split(' ')
    words.sort()
    x = ' '.join(words)
    exclude = set('{}{}'.format(string.punctuation, string.digits))
    x = ''.join(ch for ch in x if ch not in exclude)
    x = '{} '.format(x.strip())
    return x

# intents = load_intents(export=True)
df['bow'] = df['name'].apply(lambda x: bow(x))
df.sort_values(by='bow', ascending=True, inplace=True)
last_bow = ''
recs = []
for idx, row in df.iterrows():
    record = {
        'name': row['name'],
        'bow': row['bow'],
        'lev_distance': ed.eval(last_bow, row['bow'])
    }
    recs.append(record)
    last_bow = row['bow']

intents = pd.DataFrame(recs, columns=['name', 'bow', 'lev_distance'])
l = intents[intents['lev_distance'] <= lev_distance_range]
r = []
for x in l.index.values:
    r.append(x - 1)
    r.append(x)

r = list(set(r))
l = intents.iloc[r, :]
Using textdistance, you could try this:
import pandas as pd
import textdistance

df = pd.DataFrame(
    {
        "text": [
            "Rahul dsa",
            "Rasul dsad",
            "Raul ascs",
            "shrez",
            "Indya",
            "Indi",
            "shez",
            "india",
            "kloa",
            "klsnsd",
        ],
    }
)
df = (
    df
    .assign(
        match=df["text"].map(
            lambda x: [
                i
                for i, text in enumerate(df["text"])
                if textdistance.jaro_winkler(x, text) >= 0.9
            ]
        )
    )
    .sort_values(by="match")
    .drop(columns="match")
)
print(df)
# Output
text
0 Rahul dsa
1 Rasul dsad
2 Raul ascs
3 shrez
6 shez
4 Indya
5 Indi
7 india
8 kloa
9 klsnsd

How to groupby specifically datetime index in a multiindex column by month

The data frame shows the date with the amount of import and export, further split into coastal and regional data per day over one month.
What I wish to achieve is to club, i.e. sum, all the data for a month, so that in the end there is only one entry per month, dated at the month's end, with all the corresponding fields added up.
This is the code:
df = pd.read_csv('output.csv', encoding="utf-8", skipinitialspace=True, engine='python')
datadf = df
datadf = datadf.dropna(axis=0, how='any')
datadf = datadf.astype({'ForeignType': 'category', 'ImportType': 'category',
                        'ArrDate': 'datetime64', 'DepDate': 'datetime64'})

# datadf = datadf.groupby(datadf['ArrDate'].dt.strftime('%B'))['ComoQty'].sum()
datadf1 = datadf.groupby(['ArrDate', 'ImportType', 'ForeignType'])['ComoQty'].sum()
datadf2 = datadf1.to_frame()
datadf2.fillna(value=0, inplace=True)
# datadf2 = datadf2.reset_index('ImportType')
# datadf2 = datadf2.reset_index('ForeignType')
# datadf2 = datadf2.reset_index('ArrDate')
datadf2

datadf1 = datadf.drop(columns='Unnamed: 0')
prac = datadf1
prac = prac.set_index('ArrDate')
prac_dates = prac.copy()
prac = prac.resample('D').apply({'ShipName': 'count', 'ComoQty': 'sum'}).reset_index()
prac_dates = ((prac_dates.resample('M').apply({'ComoQty': 'sum'})) / 1000).reset_index()
prac_dates['Month'] = pd.DatetimeIndex(prac_dates['ArrDate']).strftime('%B')
del prac_dates['ArrDate']
# prac_dates
prac['Month'] = pd.DatetimeIndex(prac['ArrDate']).strftime('%B')
# prac['Month'] = pd.to_datetime(prac['Month'], format='%B')
prac['ArrDate'] = pd.DatetimeIndex(prac['ArrDate']).strftime('%d')
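One possible approach, sketched here on the assumption that ArrDate is a proper datetime column and ComoQty is numeric, is to group on a month-end Grouper together with the two category columns and sum:

# sketch: one row per month-end date and ImportType/ForeignType combination
monthly = (datadf
           .groupby([pd.Grouper(key='ArrDate', freq='M'), 'ImportType', 'ForeignType'])['ComoQty']
           .sum()
           .reset_index())
print(monthly)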

How to improve my pandas efficiency when there are many selections

I have a big dataframe with two million rows and 60000 unique (store_id, product_id) pairs.
I need to select each (store_id, product_id), do some calculation such as resampling to H, sum, avg, and finally concat everything into a new dataframe.
The problem is that it is very, very slow and becomes slower while running.
The main code is:
def process_df(df, func, *args, **kwargs):
    '''
    '''
    product_ids = df.product_id.unique()
    store_ids = df.store_id.unique()
    # uk = df.drop_duplicates(subset=['store_id','product_id'])
    # for idx, item in uk.iterrows():
    all_df = list()
    i = 1
    with tqdm(total=product_ids.shape[0]*store_ids.shape[0]) as t:
        for store_id in store_ids:
            sdf = df.loc[df['store_id']==store_id]
            for product_id in product_ids:
                new_df = sdf.loc[(sdf['product_id']==product_id)]
                if new_df.shape[0] < 14:
                    continue
                new_df = func(new_df, *args, **kwargs)
                new_df.loc[:, 'store_id'] = store_id
                new_df.loc[:, 'product_id'] = product_id
                all_df.append(new_df)
                t.update()
    all_df = pd.concat(all_df)
    return all_df

def process_order_items(df, store_id=None, product_id=None, freq='D'):
    if store_id and "store_id" in df.columns:
        df = df.loc[df['store_id']==store_id]
    if product_id and "product_id" in df.columns:
        df = df.loc[df['product_id']==product_id]
    # convert to datetime
    df.loc[:, "datetime_create"] = pd.to_datetime(df.time_create, unit='ms').dt.tz_localize('UTC').dt.tz_convert('Asia/Shanghai').dt.tz_localize(None)
    df = df[["price", "count", "fee_total", "fee_real", "price_real", "price_guide", "price_change_category", "datetime_create"]]
    df.loc[:, "has_discount"] = (df.price_change_category > 0).astype(int)
    df.loc[:, "clearance"] = df.price_change_category.apply(lambda x: x in (10, 20, 23)).astype(int)
    if not freq:
        df.loc[:, "date_create"] = df["datetime_create"]
    else:
        assert freq in ('D', 'H')
        df.index = df.loc[:, "datetime_create"]
        discount_order_count = df['has_discount'].resample(freq).sum()
        clearance_order_count = df['clearance'].resample(freq).sum()
        discount_sale_count = df.loc[df.has_discount > 0, 'count'].resample(freq).sum()
        clearance_sale_count = df.loc[df.clearance > 0, 'count'].resample(freq).sum()
        no_discount_price = df.loc[df.has_discount == 0, 'price'].resample(freq).sum()
        no_clearance_price = df.loc[df.clearance == 0, 'price'].resample(freq).sum()
        order_count = df['count'].resample(freq).count()
        day_count = df['count'].resample(freq).sum()
        price_guide = df['price_guide'].resample(freq).max()
        price_avg = (df['price'] * df['count']).resample(freq).sum() / day_count
        df = pd.DataFrame({
            "price": price_avg,
            "price_guide": price_guide,
            "sale_count": day_count,
            "order_count": order_count,
            "discount_order_count": discount_order_count,
            "clearance_order_count": clearance_order_count,
            "discount_sale_count": discount_sale_count,
            "clearance_sale_count": clearance_sale_count,
        })
        df = df.drop(df[df.order_count == 0].index)
    return df
I think the problem is that there are too many redundant selections.
Maybe I could use groupby(['store_id','product_id']).agg to avoid the redundant selections, but I have no idea how to use process_order_items with it and merge the results together.
I think you can change:
df.loc[:,"clearance"] = df.price_change_category.apply(lambda x:x in(10, 20, 23)).astype(int)
to Series.isin:
df["clearance"] = df.price_change_category.isin([10, 20, 23]).astype(int)
There is also a solution with Resampler.aggregate:
d = {'has_discount': 'sum',
     'clearance': 'sum',
     'count': ['count', 'sum'],
     'price_guide': 'max'}
df1 = df.resample(freq).agg(d)
df1.columns = df1.columns.map('_'.join)

# map the flattened aggregate names back to the output names
d1 = {'has_discount_sum': 'discount_order_count',
      'clearance_sum': 'clearance_order_count',
      'count_count': 'order_count',
      'count_sum': 'day_count',
      'price_guide_max': 'price_guide'}
df1 = df1.rename(columns=d1)
Another idea is to not convert the boolean masks to integers, but to use the boolean columns directly for filtering:
df["has_discount"] = df.price_change_category > 0
df["clearance"] = df.price_change_category.isin([10, 20, 23])
discount_sale_count = df.loc[df.has_discount, 'count'].resample(freq).sum()
clearance_sale_count = df.loc[df.clearance, 'count'].resample(freq).sum()
# for filtering == 0, invert the boolean mask columns with ~
no_discount_price = df.loc[~df.has_discount, 'price'].resample(freq).sum()
no_clearance_price = df.loc[~df.clearance, 'price'].resample(freq).sum()
The first function should be simplified with GroupBy.apply instead of the loops; then the concat is not necessary:
def f(x):
    print(x)

df = df.groupby(['product_id','store_id']).apply(f)
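As a rough sketch of how the existing process_order_items could be plugged into that pattern (assumptions: each group only needs the freq argument, and groups with fewer than 14 rows should still be skipped, as in the loop version):

def per_group(g):
    if len(g) < 14:
        return None          # groups for which apply returns None are dropped
    return process_order_items(g, freq='D')

all_df = df.groupby(['store_id', 'product_id'], group_keys=True).apply(per_group)
# store_id / product_id come back as index levels, so the manual column
# assignment and the final pd.concat from process_df are no longer needed
all_df = all_df.reset_index(level=['store_id', 'product_id'])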

Aroon Indicator in PySpark: How to count the number of rows between the max value and the current value in each group

Aroon Up, Aroon Dn, Aroon Oscillator
from pyspark.sql import Window
from pyspark.sql.functions import max, min   # pyspark's max/min, shadowing the builtins

def myFunction(myData):
    df = myData
    col = 'Date'
    groupCols = ['Name']
    window = Window.partitionBy(groupCols).orderBy(df[col].asc()).rowsBetween(-11, 0)
    max_value = max(df['value']).over(window)
    min_value = min(df['value']).over(window)
    df = df.withColumn('max', max_value)
    df = df.withColumn('min', min_value)
I need to find the number of rows between the max value and the current value for each group
    periods_since_max =
    periods_since_min =
    df = df.withColumn('periods_since_max', periods_since_max)
    df = df.withColumn('periods_since_min', periods_since_min)
    aroon_up = ((12.0 - df['periods_since_max']) / 12.0) * 100
    aroon_dn = ((12.0 - df['periods_since_min']) / 12.0) * 100
    df = df.withColumn('aroon_up', aroon_up)
    df = df.withColumn('aroon_dn', aroon_dn)
    oscillator = df['aroon_up'] - df['aroon_dn']
    df = df.withColumn('oscillator', oscillator)
    return df
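One way the two missing expressions could be computed, sketched here as an assumption (it reuses df, window, groupCols and col from the function above; on ties it takes the most recent max and the earliest min): number each row within its group, take the max/min of (value, row number) pairs over the same trailing 12-row window, and subtract the extreme's row number from the current one.

from pyspark.sql import functions as F

w_order = Window.partitionBy(groupCols).orderBy(df[col].asc())
df = df.withColumn('rn', F.row_number().over(w_order))   # row number per Name, in Date order

# struct comparison: the aggregate returns the (value, rn) pair of the window extreme,
# so ['rn'] extracts the row number at which that extreme occurred
rn_of_max = F.max(F.struct(df['value'], df['rn'])).over(window)['rn']
rn_of_min = F.min(F.struct(df['value'], df['rn'])).over(window)['rn']

periods_since_max = df['rn'] - rn_of_max
periods_since_min = df['rn'] - rn_of_min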

I want to create a time series of monthly means in Pandas

I have a dataframe that consists of hourly data for a whole year. I want to calculate the monthly means and show them in a time series plot. I have one variable, which is the NO2 value.
#Cleaning data
ck_2000 = pd.read_csv('2000-CamdenKerbside.csv', header=0,skiprows=4,usecols=range(0,3),skipfooter = 1, na_values = 'No data',engine = 'python')
colnames = ['Date', 'Time', 'NO2']
ck_2000.columns = colnames
#Reformat date/time
ck_2000.Time.replace(to_replace = '24:00:00', value = '00:00:00', inplace = True)
dtw = pd.to_datetime(ck_2000.Date + ck_2000.Time,format='%d/%m/%Y%H:%M:%S')
ck_2000.index = dtw
#Index dataframe by date
firstDate = ck_2000.index[0]
lastDate = ck_2000.index[len(ck_2000.Date) - 1]
ck2000 = ck_2000.reindex(index=pd.date_range(start = firstDate, end =lastDate, freq = '1H'), fill_value= None)
#Change data type to float
ck2000['NO2'] = ck2000['NO2'].dropna().astype('int64')
#Interpolation
ck_2000_int = ck_2000.interpolate()
#df's for all months
ck_2000_jan = ck_2000_int['2000-01']
ck_2000_feb = ck_2000_int['2000-02']
ck_2000_mar = ck_2000_int['2000-03']
ck_2000_apr = ck_2000_int['2000-04']
ck_2000_may = ck_2000_int['2000-05']
ck_2000_jun = ck_2000_int['2000-06']
ck_2000_jul = ck_2000_int['2000-07']
ck_2000_aug = ck_2000_int['2000-08']
ck_2000_sept = ck_2000_int['2000-09']
ck_2000_oct = ck_2000_int['2000-10']
ck_2000_nov = ck_2000_int['2000-11']
ck_2000_dec = ck_2000_int['2000-12']
You should be able to use resample.
Consider the following example:
import numpy as np
import pandas as pd

tidx = pd.date_range('2000-01-01', '2000-12-31 23:00', freq='H')
ck_2000_int = pd.DataFrame(dict(NO2=np.random.randn(len(tidx))), tidx)
ck_2000_int.resample('M').mean().plot()
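Applied to the interpolated frame built in the question (assuming ck_2000_int keeps the hourly datetime index constructed above), the monthly series would be:

# monthly NO2 means from the interpolated hourly data
monthly_no2 = ck_2000_int['NO2'].resample('M').mean()
monthly_no2.plot()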
