Additional columns added to saved CSV - python
I have the following code, which generates features from a CSV file:
def gen_features_per_id(file_name, label):
    df = pd.read_csv(file_name, delimiter=',')
    df['dt'] = pd.to_datetime(df['datetime'], unit='s')

    row = []
    column_names = ['group_timestamp', 'label',
                    'x_mean', 'x_median', 'x_stdev', 'x_raw_min', 'x_raw_max', 'x_abs_min', 'x_abs_max',
                    'y_mean', 'y_median', 'y_stdev', 'y_raw_min', 'y_raw_max', 'y_abs_min', 'y_abs_max',
                    'z_mean', 'z_median', 'z_stdev', 'z_raw_min', 'z_raw_max', 'z_abs_min', 'z_abs_max']
    group_df = pd.DataFrame(columns=column_names)

    for group_name, g in df.groupby(pd.Grouper(freq='10s', key='dt')):
        print(f'Start time {group_name} has {len(g)} records within 10 secs')
        group_timestamp = group_name
        label = label

        x = g['x'].head(50)
        x_mean = x.mean()
        x_median = x.median()
        x_std_dev = statistics.stdev(x)
        x_raw_min = min(x)
        x_raw_max = max(x)
        x_abs_min = min(abs(x))
        x_abs_max = max(abs(x))
        # print(
        #     f'Mean : {x_mean}, Median : {x_median}, Stdev : {x_std_dev}, '
        #     f'X raw Min : {x_raw_min}, X raw Max : {x_raw_max}, '
        #     f'X abs Min : {x_abs_min}, X abs Max : {x_abs_max}'
        # )

        y = g['y'].head(50)
        y_mean = y.mean()
        y_median = y.median()
        y_std_dev = statistics.stdev(y)
        y_raw_min = min(y)
        y_raw_max = max(y)
        y_abs_min = min(abs(y))
        y_abs_max = max(abs(y))
        # print(
        #     f'Mean : {y_mean}, Median : {y_median}, Std dev : {y_std_dev}, '
        #     f'X raw Min : {y_raw_min}, X raw Max : {y_raw_max}, '
        #     f'X abs Min : {y_abs_min}, X abs Max : {y_abs_max}'
        # )

        z = g['z'].head(50)
        z_mean = z.mean()
        z_median = z.median()
        z_std_dev = statistics.stdev(z)
        z_raw_min = min(z)
        z_raw_max = max(z)
        z_abs_min = min(abs(z))
        z_abs_max = max(abs(z))
        # print(
        #     f'Mean : {z_mean}, Median : {z_median}, Std dev : {z_std_dev}, '
        #     f'X raw Min : {z_raw_min}, X raw Max : {z_raw_max}, '
        #     f'X abs Min : {z_abs_min}, X abs Max : {z_abs_max}'
        # )

        row.append(group_timestamp)
        row.append(label)
        row.append(x_mean)
        row.append(x_median)
        row.append(x_std_dev)
        row.append(x_raw_min)
        row.append(x_raw_max)
        row.append(x_abs_min)
        row.append(x_abs_max)
        row.append(y_mean)
        row.append(y_median)
        row.append(y_std_dev)
        row.append(y_raw_min)
        row.append(y_raw_max)
        row.append(y_abs_min)
        row.append(y_abs_max)
        row.append(z_mean)
        row.append(z_median)
        row.append(z_std_dev)
        row.append(z_raw_min)
        row.append(z_raw_max)
        row.append(z_abs_min)
        row.append(z_abs_max)

        group_df = group_df.append([row], ignore_index=True)
        group_df.to_csv("some.csv", index=False)
        row = []
But the saved CSV file has additional columns added to the start of the header, equal in number to the supplied columns.
Sample CSV
datetime,x,y,z,label
1493740845,0.0004,-0.0001,0.0045,bad
1493740846,0.0003,0.0002,0.0047,bad
1493740847,0.0005,0.0001,0.0049,bad
1493740848,0.0006,0.0004,0.005,bad
1493740849,0.0006,-0.0003,0.005,bad
1493740851,0.0001,-0.0003,0.0039,bad
1493740852,-0.0006,0.0003,0.0046,bad
1493740853,0.0001,0.0,0.0048,bad
Output:
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,group_timestamp,label,x_abs_max,x_abs_min,x_mean,x_median,x_raw_max,x_raw_min,x_stdev,y_abs_max,y_abs_min,y_mean,y_median,y_raw_max,y_raw_min,y_stdev,z_abs_max,z_abs_min,z_mean,z_median,z_raw_max,z_raw_min,z_stdev
# data ... ,,,,,,,,,,,,,,,,,,,,,,,
# data ... ,,,,,,,,,,,,,,,,,,,,,,,
How to fix this?
Additionally: can you help me simplify the code further?
The problem is that in each iteration of the groupby loop you need to append the values to a row list and then append that row to a rows list created outside the loop, so that the resulting nested list can be passed to the DataFrame constructor in the last step:
#added for nested lists (outside loops)
rows = []

df['dt'] = pd.to_datetime(df['datetime'], unit='s')
for group_name, g in df.groupby(pd.Grouper(freq='10s', key='dt')):
    #added for row per loop
    row = []
    print(f'Start time {group_name} has {len(g)} records within 10 secs')
    group_timestamp = group_name
    label = label

    x = g['x'].head(50)
    x_mean = x.mean()
    ....
    row.append(z_abs_max)
    rows.append(row)

#DataFrame outside loops
group_df = pd.DataFrame(rows, columns=column_names)
print (group_df)
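For completeness, the extra numeric columns in the original output come from DataFrame.append with a plain list: pandas builds a frame with default integer column labels 0..22, which do not match column_names, so both sets of columns are kept and the named ones stay empty. A small reproduction (assumes an older pandas, since DataFrame.append was removed in pandas 2.0):

import pandas as pd

group_df = pd.DataFrame(columns=['a', 'b'])
row = [1, 2]

# appending a bare list creates integer column labels 0 and 1,
# so the result contains 0, 1, 'a', 'b' and the named columns are NaN
bad = group_df.append([row], ignore_index=True)
print(bad.columns.tolist())

# appending a dict keyed by the column names keeps the intended header
good = group_df.append([dict(zip(['a', 'b'], row))], ignore_index=True)
print(good.columns.tolist())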
Your solution can be further improved with GroupBy.agg:
#custom aggregate functions
def std_dev(x):
    return statistics.stdev(x)

def abs_min(x):
    return x.abs().min()

def abs_max(x):
    return x.abs().max()

d = ['mean', 'median', std_dev, 'min', 'max', abs_min, abs_max]
cols = ['x', 'y', 'z']

#keep only the first 50 rows per group
df[cols] = df.groupby(pd.Grouper(freq='10s', key='dt'))[cols].head(50)

#aggregate functions
group_df = df.groupby(pd.Grouper(freq='10s', key='dt'))[cols].agg(d)
group_df.columns = group_df.columns.map('_'.join)
print (group_df)
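Putting it together, a sketch of the whole function built on the agg code above (assuming the sample CSV's columns; the aggregated column names come out as x_std_dev, x_min, x_max rather than the original x_stdev, x_raw_min, x_raw_max):

import statistics
import pandas as pd

def std_dev(x):
    return statistics.stdev(x)

def abs_min(x):
    return x.abs().min()

def abs_max(x):
    return x.abs().max()

def gen_features_per_id(file_name, label):
    df = pd.read_csv(file_name)
    df['dt'] = pd.to_datetime(df['datetime'], unit='s')

    cols = ['x', 'y', 'z']
    # keep only the first 50 rows of every 10 second group
    df[cols] = df.groupby(pd.Grouper(freq='10s', key='dt'))[cols].head(50)

    # aggregate all statistics per 10 second group in one pass
    d = ['mean', 'median', std_dev, 'min', 'max', abs_min, abs_max]
    group_df = df.groupby(pd.Grouper(freq='10s', key='dt'))[cols].agg(d)
    group_df.columns = group_df.columns.map('_'.join)

    group_df = group_df.reset_index().rename(columns={'dt': 'group_timestamp'})
    group_df.insert(1, 'label', label)
    group_df.to_csv('some.csv', index=False)
    return group_df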
Related
Rank the row based on the similar text using python?
How do I rank the data frame based on the row value? I.e., I have a row that contains text data and I want to provide a rank based on similarity. I have tried the Levenshtein distance, but I am not sure how to do it for the whole table:

def bow(x=None):
    x = x.lower()
    words = x.split(' ')
    words.sort()
    x = ' '.join(words)
    exclude = set('{}{}'.format(string.punctuation, string.digits))
    x = ''.join(ch for ch in x if ch not in exclude)
    x = '{} '.format(x.strip())
    return x

#intents = load_intents(export=True)
df['bow'] = df['name'].apply(lambda x: bow(x))
df.sort_values(by='bow', ascending=True, inplace=True)

last_bow = ''
recs = []
for idx, row in df.iterrows():
    record = {
        'name': row['name'],
        'bow': row['bow'],
        'lev_distance': ed.eval(last_bow, row['bow'])
    }
    recs.append(record)
    last_bow = row['bow']

intents = pd.DataFrame(recs, columns=['name', 'bow', 'lev_distance'])
l = intents[intents['lev_distance'] <= lev_distance_range]
r = []
for x in l.index.values:
    r.append(x - 1)
    r.append(x)
r = list(set(r))
l = intents.iloc[r, :]
Using textdistance, you could try this:

import pandas as pd
import textdistance

df = pd.DataFrame(
    {
        "text": [
            "Rahul dsa",
            "Rasul dsad",
            "Raul ascs",
            "shrez",
            "Indya",
            "Indi",
            "shez",
            "india",
            "kloa",
            "klsnsd",
        ],
    }
)

df = (
    df
    .assign(
        match=df["text"].map(
            lambda x: [
                i
                for i, text in enumerate(df["text"])
                if textdistance.jaro_winkler(x, text) >= 0.9
            ]
        )
    )
    .sort_values(by="match")
    .drop(columns="match")
)
print(df)

# Output
         text
0   Rahul dsa
1  Rasul dsad
2   Raul ascs
3       shrez
6        shez
4       Indya
5        Indi
7       india
8        kloa
9      klsnsd
How to groupby specifically datetime index in a multiindex column by month
The data frame shows the date with the amount of import and export, further split into coastal and regional data per day for one month. What I wish to achieve is to club, i.e. sum, all the data presented (one month in this case), so that in the end there is only one entry, dated at the month end, with all the corresponding fields added up. This is the code:

df = pd.read_csv('output.csv', encoding="utf-8", skipinitialspace=True, engine='python')
datadf = df
datadf = datadf.dropna(axis=0, how='any')
datadf = datadf.astype({'ForeignType': 'category', 'ImportType': 'category',
                        'ArrDate': 'datetime64', 'DepDate': 'datetime64'})

# datadf = datadf.groupby(datadf['ArrDate'].dt.strftime('%B'))['ComoQty'].sum()
datadf1 = datadf.groupby(['ArrDate', 'ImportType', 'ForeignType'])['ComoQty'].sum()
datadf2 = datadf1.to_frame()
datadf2.fillna(value=0, inplace=True)
# datadf2 = datadf2.reset_index('ImportType')
# datadf2 = datadf2.reset_index('ForeignType')
# datadf2 = datadf2.reset_index('ArrDate')
datadf2
datadf1 = datadf.drop(columns='Unnamed: 0')
prac = datadf1
prac = prac.set_index('ArrDate')
prac_dates = prac.copy()

prac = prac.resample('D').apply({'ShipName': 'count', 'ComoQty': 'sum'}).reset_index()
prac_dates = ((prac_dates.resample('M').apply({'ComoQty': 'sum'})) / 1000).reset_index()
prac_dates['Month'] = pd.DatetimeIndex(prac_dates['ArrDate']).strftime('%B')
del prac_dates['ArrDate']
# prac_dates

prac['Month'] = pd.DatetimeIndex(prac['ArrDate']).strftime('%B')
# prac['Month'] = pd.to_datetime(prac['Month'], format='%B')
prac['ArrDate'] = pd.DatetimeIndex(prac['ArrDate']).strftime('%d')
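For the monthly roll-up itself, pd.Grouper can group the datetime column by month directly; a minimal sketch, assuming the columns shown above ('ArrDate', 'ImportType', 'ForeignType', 'ComoQty'):

# sum ComoQty per month-end date and per ImportType/ForeignType combination
monthly = (datadf
           .groupby([pd.Grouper(key='ArrDate', freq='M'),
                     'ImportType', 'ForeignType'])['ComoQty']
           .sum()
           .reset_index())
print(monthly)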
How to improve my pandas efficiency when there is many selections
I have a big dataframe which has two million rows. There are 60000 unique (store_id, product_id) pairs. I need to select by each (store_id, product_id) pair, do some calculations such as resample to H, sum, avg, and finally concat everything into a new dataframe. The problem is that it is very slow, and it becomes slower while running. The main code is:

def process_df(df, func, *args, **kwargs):
    '''
    '''
    product_ids = df.product_id.unique()
    store_ids = df.store_id.unique()
    # uk = df.drop_duplicates(subset=['store_id','product_id'])
    # for idx, item in uk.iterrows():
    all_df = list()
    i = 1
    with tqdm(total=product_ids.shape[0]*store_ids.shape[0]) as t:
        for store_id in store_ids:
            sdf = df.loc[df['store_id']==store_id]
            for product_id in product_ids:
                new_df = sdf.loc[(sdf['product_id']==product_id)]
                if new_df.shape[0] < 14:
                    continue
                new_df = func(new_df, *args, **kwargs)
                new_df.loc[:, 'store_id'] = store_id
                new_df.loc[:, 'product_id'] = product_id
                all_df.append(new_df)
                t.update()
    all_df = pd.concat(all_df)
    return all_df

def process_order_items(df, store_id=None, product_id=None, freq='D'):
    if store_id and "store_id" in df.columns:
        df = df.loc[df['store_id']==store_id]
    if product_id and "product_id" in df.columns:
        df = df.loc[df['product_id']==product_id]

    # convert to datetime
    df.loc[:, "datetime_create"] = pd.to_datetime(df.time_create, unit='ms').dt.tz_localize('UTC').dt.tz_convert('Asia/Shanghai').dt.tz_localize(None)
    df = df[["price", "count", "fee_total", "fee_real", "price_real", "price_guide", "price_change_category", "datetime_create"]]

    df.loc[:, "has_discount"] = (df.price_change_category > 0).astype(int)
    df.loc[:, "clearance"] = df.price_change_category.apply(lambda x: x in (10, 20, 23)).astype(int)

    if not freq:
        df.loc[:, "date_create"] = df["datetime_create"]
    else:
        assert freq in ('D', 'H')
        df.index = df.loc[:, "datetime_create"]

        discount_order_count = df['has_discount'].resample(freq).sum()
        clearance_order_count = df['clearance'].resample(freq).sum()
        discount_sale_count = df.loc[df.has_discount > 0, 'count'].resample(freq).sum()
        clearance_sale_count = df.loc[df.clearance > 0, 'count'].resample(freq).sum()
        no_discount_price = df.loc[df.has_discount == 0, 'price'].resample(freq).sum()
        no_clearance_price = df.loc[df.clearance == 0, 'price'].resample(freq).sum()
        order_count = df['count'].resample(freq).count()
        day_count = df['count'].resample(freq).sum()
        price_guide = df['price_guide'].resample(freq).max()
        price_avg = (df['price'] * df['count']).resample(freq).sum() / day_count

        df = pd.DataFrame({
            "price": price_avg,
            "price_guide": price_guide,
            "sale_count": day_count,
            "order_count": order_count,
            "discount_order_count": discount_order_count,
            "clearance_order_count": clearance_order_count,
            "discount_sale_count": discount_sale_count,
            "clearance_sale_count": clearance_sale_count,
        })
        df = df.drop(df[df.order_count == 0].index)
    return df

I think the problem is that there are too many redundant selections. Maybe I could use groupby(['store_id','product_id']).agg to avoid the redundancy, but I have no idea how to use process_order_items with it and merge the results together.
I think you can change:

df.loc[:, "clearance"] = df.price_change_category.apply(lambda x: x in (10, 20, 23)).astype(int)

to Series.isin:

df["clearance"] = df.price_change_category.isin([10, 20, 23]).astype(int)

There is also a solution with Resampler.aggregate:

d = {'has_discount': 'sum',
     'clearance': 'sum',
     'count': ['count', 'sum'],
     'price_guide': 'max'}
df1 = df.resample(freq).agg(d)
df1.columns = df1.columns.map('_'.join)

d1 = {'has_discount_sum': 'discount_order_count',
      'clearance_sum': 'clearance_order_count',
      'count_count': 'order_count',
      'count_sum': 'day_count',
      'price_guide_max': 'price_guide'}
df1 = df1.rename(columns=d1)

Another idea is not to convert the boolean masks to integer, but to use the columns for filtering, like:

df["has_discount"] = df.price_change_category > 0
df["clearance"] = df.price_change_category.isin([10, 20, 23])

discount_sale_count = df.loc[df.has_discount, 'count'].resample(freq).sum()
clearance_sale_count = df.loc[df.clearance, 'count'].resample(freq).sum()

#for filtering ==0 invert the boolean mask columns with ~
no_discount_price = df.loc[~df.has_discount, 'price'].resample(freq).sum()
no_clearance_price = df.loc[~df.clearance, 'price'].resample(freq).sum()

The first function should be simplified with GroupBy.apply instead of the loops; then concat is not necessary:

def f(x):
    print (x)

df = df.groupby(['product_id','store_id']).apply(f)
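For example, a minimal sketch of the apply-based replacement for process_df, reusing process_order_items from the question and keeping the original "at least 14 rows" filter (the freq value is just an example):

def per_group(g, freq='H'):
    # skip (store_id, product_id) pairs with too few rows, as in the original loop
    if len(g) < 14:
        return None
    return process_order_items(g, freq=freq)

# one resampled frame per (store_id, product_id) pair, concatenated automatically
result = df.groupby(['store_id', 'product_id']).apply(per_group, freq='H')
print(result.head())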
Aroon Indicator in PySpark: How to count the number of rows between the max value and the current value in each group
Aroon Up, Aroon Dn, Aroon Oscillator

def myFunction(myData):
    df = myData
    col = 'Date'
    groupCols = ['Name']

    window = Window.partitionBy(groupCols).orderBy(df[col].asc()).rowsBetween(-11, 0)
    max_value = max(df['value']).over(window)
    min_value = min(df['value']).over(window)
    df = df.withColumn('max', max_value)
    df = df.withColumn('min', min_value)

    # I need to find the number of rows between the max value and the current value for each group
    periods_since_max =
    periods_since_min =

    df = df.withColumn('periods_since_max', periods_since_max)
    df = df.withColumn('periods_since_min', periods_since_min)

    aroon_up = ((12.0 - df['periods_since_max']) / 12.0) * 100
    aroon_dn = ((12.0 - df['periods_since_min']) / 12.0) * 100
    df = df.withColumn('aroon_up', aroon_up)
    df = df.withColumn('aroon_dn', aroon_dn)

    oscillator = df['aroon_up'] - df['aroon_dn']
    df = df.withColumn('oscillator', oscillator)
    return df
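One possible way to fill in the two blanks (a sketch, not from the original post, assuming the same df with 'Name', 'Date' and 'value' columns): number the rows within each partition, then take the max/min of a (value, position) struct over the same 12-row window; struct ordering compares the first field first, so the result carries the position at which the extreme occurred.

from pyspark.sql import Window
from pyspark.sql import functions as F

order_w = Window.partitionBy('Name').orderBy(F.col('Date').asc())
window = order_w.rowsBetween(-11, 0)

# position of each row inside its partition
df = df.withColumn('pos', F.row_number().over(order_w))

# struct max/min is ordered by 'value' first, so .getField('pos') is the
# position at which the window max/min occurred
pos_of_max = F.max(F.struct(F.col('value'), F.col('pos'))).over(window).getField('pos')
pos_of_min = F.min(F.struct(F.col('value'), F.col('pos'))).over(window).getField('pos')

df = df.withColumn('periods_since_max', F.col('pos') - pos_of_max)
df = df.withColumn('periods_since_min', F.col('pos') - pos_of_min)

These two columns can then be plugged into the aroon_up / aroon_dn expressions above.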
I want to create a time series of monthly means in Pandas
I have a dataframe that consists of hourly data for a whole year. I want to calculate the monthly means and show them in a time series plot. I have one variable which is NO2 values.

#Cleaning data
ck_2000 = pd.read_csv('2000-CamdenKerbside.csv', header=0, skiprows=4, usecols=range(0, 3),
                      skipfooter=1, na_values='No data', engine='python')
colnames = ['Date', 'Time', 'NO2']
ck_2000.columns = colnames

#Reformat date/time
ck_2000.Time.replace(to_replace='24:00:00', value='00:00:00', inplace=True)
dtw = pd.to_datetime(ck_2000.Date + ck_2000.Time, format='%d/%m/%Y%H:%M:%S')
ck_2000.index = dtw

#Index dataframe by date
firstDate = ck_2000.index[0]
lastDate = ck_2000.index[len(ck_2000.Date) - 1]
ck2000 = ck_2000.reindex(index=pd.date_range(start=firstDate, end=lastDate, freq='1H'), fill_value=None)

#Change data type to float
ck2000['NO2'] = ck2000['NO2'].dropna().astype('int64')

#Interpolation
ck_2000_int = ck_2000.interpolate()

#df's for all months
ck_2000_jan = ck_2000_int['2000-01']
ck_2000_feb = ck_2000_int['2000-02']
ck_2000_mar = ck_2000_int['2000-03']
ck_2000_apr = ck_2000_int['2000-04']
ck_2000_may = ck_2000_int['2000-05']
ck_2000_jun = ck_2000_int['2000-06']
ck_2000_jul = ck_2000_int['2000-07']
ck_2000_aug = ck_2000_int['2000-08']
ck_2000_sept = ck_2000_int['2000-09']
ck_2000_oct = ck_2000_int['2000-10']
ck_2000_nov = ck_2000_int['2000-11']
ck_2000_dec = ck_2000_int['2000-12']
You should be able to use resample. Consider the following example:

tidx = pd.date_range('2000-01-01', '2000-12-31 23:00', freq='H')
ck_2000_int = pd.DataFrame(dict(NO2=np.random.randn(len(tidx))), tidx)

ck_2000_int.resample('M').mean().plot()
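Applied to the question's data, the twelve per-month slices are then unnecessary; a minimal sketch, assuming ck_2000_int is the interpolated frame indexed by the hourly DatetimeIndex built above:

# monthly mean of the NO2 series, one value per month-end timestamp
monthly_means = ck_2000_int['NO2'].resample('M').mean()
print(monthly_means)
monthly_means.plot(title='Monthly mean NO2, 2000')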