Hi, I'm wondering what I should do to save all of these values in a dataframe...
for mask in range(len(predicted_masks)):
    folha = np.where(predicted_masks[mask, :, :] == 1, 1, 0)
    soma_folha = np.sum(folha)
    sintoma = np.where(predicted_masks[mask, :, :] == 2, 1, 0)
    soma_sintoma = np.sum(sintoma)
    fundo = np.where(predicted_masks[mask, :, :] == 0, 1, 0)
    soma_fundo = np.sum(fundo)
    # print(soma_fundo, soma_folha, soma_sintoma)
    severidade = (soma_sintoma / (soma_folha + soma_sintoma)) * 100
    severidade = round(severidade, 2)
    print(soma_fundo, soma_folha, soma_sintoma, severidade)
    d = {'mask': mask, 'soma_folha': soma_folha, 'soma_sintoma': soma_sintoma, 'soma_fundo': soma_fundo, 'severidade': severidade}
    df = pd.DataFrame([d])
    df.to_csv('/content/drive/MyDrive/DB_mosca_minadora/pred_csv/pred_test_db_anotated.csv', index=False)
I already tried saving each value separately, but it won't work.
I need to save all the printed values in a dataframe, one row per image (304 images, so 304 rows), but it only saves the last row.
Can someone help me?
You are overwriting and saving your dataframe within the loop. You should instead do something like the following:
df = pd.DataFrame(columns=['mask', 'soma_folha', 'soma_sintoma', 'soma_fundo', 'severidade'])

for mask in range(len(predicted_masks)):
    folha = np.where(predicted_masks[mask, :, :] == 1, 1, 0)
    soma_folha = np.sum(folha)
    sintoma = np.where(predicted_masks[mask, :, :] == 2, 1, 0)
    soma_sintoma = np.sum(sintoma)
    fundo = np.where(predicted_masks[mask, :, :] == 0, 1, 0)
    soma_fundo = np.sum(fundo)
    # print(soma_fundo, soma_folha, soma_sintoma)
    severidade = (soma_sintoma / (soma_folha + soma_sintoma)) * 100
    severidade = round(severidade, 2)
    print(soma_fundo, soma_folha, soma_sintoma, severidade)
    d = {'mask': mask, 'soma_folha': soma_folha, 'soma_sintoma': soma_sintoma, 'soma_fundo': soma_fundo, 'severidade': severidade}
    new_df = pd.DataFrame([d])
    df = pd.concat([df, new_df])  # append this image's row to the running dataframe

df.to_csv('/content/drive/MyDrive/DB_mosca_minadora/pred_csv/pred_test_db_anotated.csv', index=False)  # save once, after the loop
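Side note: concatenating inside the loop re-copies the growing dataframe on every iteration. A slightly cheaper pattern (a sketch reusing the imports and predicted_masks from the question, with np.sum applied to the boolean comparisons directly) collects the row dicts in a list and builds the dataframe once:

rows = []
for mask in range(len(predicted_masks)):
    folha = np.sum(predicted_masks[mask] == 1)    # leaf pixels
    sintoma = np.sum(predicted_masks[mask] == 2)  # symptom pixels
    fundo = np.sum(predicted_masks[mask] == 0)    # background pixels
    severidade = round(sintoma / (folha + sintoma) * 100, 2)
    rows.append({'mask': mask, 'soma_folha': folha, 'soma_sintoma': sintoma,
                 'soma_fundo': fundo, 'severidade': severidade})

df = pd.DataFrame(rows)  # built once, instead of one concat per iteration
df.to_csv('/content/drive/MyDrive/DB_mosca_minadora/pred_csv/pred_test_db_anotated.csv', index=False)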
I'm trying to use a custom scorer with the following code:
def edge_score(y, y_pred):
    y_pred.name = 'y_pred'
    y.name = 'y'
    df = pd.concat([y_pred, y])
    df['sign_pred'] = df.y_pred.apply(np.sign)
    df['sign_true'] = df.y.apply(np.sign)
    df['is_correct'] = 0
    df.loc[df.sign_pred * df.sign_true > 0, 'is_correct'] = 1
    df['is_incorrect'] = 0
    df.loc[df.sign_pred * df.sign_true < 0, 'is_incorrect'] = 1
    df['is_predicted'] = df.is_correct + df.is_incorrect
    df['result'] = df.sign_pred * df.y
    df['edge'] = df.result.mean()
    output_errors = df[['edge']]
    output_errors.to_numpy()
    return np.average(output_errors)

edge = make_scorer(edge_score)
I get the following error
AttributeError: 'numpy.ndarray' object has no attribute 'name'
When I comment out the .name lines, I get the following error
TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid
When I convert the true values and the predictions to dataframes, I get the following error:
y_pred = pd.DataFrame(y_pred)
y = pd.DataFrame(y)
AttributeError: 'DataFrame' object has no attribute 'y_pred'
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html#sklearn.metrics.make_scorer
You should first create a DataFrame from the two numpy arrays y and y_pred, and then perform all the operations on it:
def edge_score(y, y_pred):
    df = pd.DataFrame({"y": y, "y_pred": y_pred})
    df['sign_pred'] = df.y_pred.apply(np.sign)
    df['sign_true'] = df.y.apply(np.sign)
    df['is_correct'] = 0
    df.loc[df.sign_pred * df.sign_true > 0, 'is_correct'] = 1
    df['is_incorrect'] = 0
    df.loc[df.sign_pred * df.sign_true < 0, 'is_incorrect'] = 1
    df['is_predicted'] = df.is_correct + df.is_incorrect
    df['result'] = df.sign_pred * df.y
    df['edge'] = df.result.mean()
    # 'edge' is a constant column, so its average is just the mean result
    output_errors = df[['edge']]
    return np.average(output_errors)

edge = make_scorer(edge_score)
Change these lines of code:

df['sign_pred'] = df.y_pred.apply(np.sign)
df['sign_true'] = df.y.apply(np.sign)

to these (np.sign is vectorized, so the row-wise apply is unnecessary):

df['sign_pred'] = np.sign(y_pred)
df['sign_true'] = np.sign(y)
def custom_score(y_true, y_pred):
    true_sign = np.sign(y_true)
    pred_sign = np.sign(y_pred)
    true_vs_pred = np.where(true_sign == pred_sign, 1, 0)
    true_pred = (true_vs_pred == 1).sum()
    return true_pred

custom_scorer = make_scorer(custom_score, greater_is_better=True)
Convert everything to arrays and then process those.
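For completeness, a minimal usage sketch (the estimator and the toy data here are hypothetical placeholders, not from the question): make_scorer wraps the metric so cross-validation utilities can call it, and they pass y_true and y_pred in as plain numpy arrays, which is exactly why the original .name/concat code failed.

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Hypothetical toy regression data; any X, y would do.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.1, size=100)

# The scorer receives y_true and y_pred as numpy arrays.
print(cross_val_score(LinearRegression(), X, y, cv=5, scoring=custom_scorer))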
So I am writing a Python program that will pull in data and highlight rows based on a cell value. However, I only want to convert/show one of the highlighted columns (the name of the device) in the HTML (and ultimately a PDF) which I can then display to a customer. How could I go about doing this?
Code:
df = pd.DataFrame(items)

def highlightCritical(s, threshold, column):
    isSDT = pd.Series(data=False, index=s.index)
    isSDT[column] = s.loc[column] > threshold
    return ['background-color: red' if isSDT.any() else '' for v in isSDT]

def highlightError(s, threshold, column):
    isSDT = pd.Series(data=False, index=s.index)
    isSDT[column] = s.loc[column] > threshold
    return ['background-color: orange' if isSDT.any() else '' for v in isSDT]

def highlightWarning(s, threshold, column):
    isSDT = pd.Series(data=False, index=s.index)
    isSDT[column] = s.loc[column] > threshold
    return ['background-color: yellow' if isSDT.any() else '' for v in isSDT]

styled_df_a = df.style.apply(highlightWarning, threshold=0, column=['warnAlert', 'confirmedWarnAlert'], axis=1)\
    .apply(highlightError, threshold=0, column=['errorAlert', 'confirmedErrorAlert'], axis=1)\
    .apply(highlightCritical, threshold=0, column=['criticalAlert', 'confirmedCriticalAlert'], axis=1)

html = styled_df_a.hide_index().render()
with open("html_c.html", "w") as fp:
    fp.write(html)
I fixed the issue. For anybody wondering, you can append .hide_columns(LIST OF COLUMNS) to the end of your Styler chain.
Code:
styled_df_a = df.style.apply(highlightWarning, threshold=0, column=['warnAlert', 'confirmedWarnAlert'], axis=1)\
    .apply(highlightError, threshold=0, column=['errorAlert', 'confirmedErrorAlert'], axis=1)\
    .apply(highlightCritical, threshold=0, column=['criticalAlert', 'confirmedCriticalAlert'], axis=1)\
    .hide_columns(columnList)
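Note that on pandas 1.4+ these Styler methods are deprecated: hide_index and hide_columns in favour of hide, and render in favour of to_html. A rough equivalent of the chain above (same df, highlight functions, and columnList) would be:

styled_df_a = df.style.apply(highlightWarning, threshold=0, column=['warnAlert', 'confirmedWarnAlert'], axis=1)\
    .apply(highlightError, threshold=0, column=['errorAlert', 'confirmedErrorAlert'], axis=1)\
    .apply(highlightCritical, threshold=0, column=['criticalAlert', 'confirmedCriticalAlert'], axis=1)\
    .hide(axis='index')\
    .hide(columnList, axis='columns')

html = styled_df_a.to_html()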
I need to understand whether the use of np.delete below removes a whole block of len(pk)-1 rows (each with 2 columns), or whether it just deletes the last row (the 2 columns kh and pk at index len(pk)-1).
# Save all the fiducial power spectrums.
aaa = len(zrange)
while aaa >= 0:
    if aaa == len(zrange):
        kh, pk = np.loadtxt(
            "test_matterpower_" + str(aaa) + ".dat",
            usecols=(0, 1),
            unpack=True,
        )
    elif aaa > 0:
        kh1, pk1 = np.loadtxt(
            "test_matterpower_" + str(aaa) + ".dat",
            usecols=(0, 1),
            unpack=True,
        )
        kh = np.vstack((kh, kh1))
        pk = np.vstack((pk, pk1))
    else:
        kh1, pk1 = np.loadtxt(
            "test_matterpower_" + str(len(zrange) + 1) + ".dat",
            usecols=(0, 1),
            unpack=True,
        )
        kh = np.vstack((kh, kh1))
        pk = np.vstack((pk, pk1))
    aaa = aaa - 1

kh = np.delete(kh, len(kh) - 1, axis=0)
pk = np.delete(pk, len(pk) - 1, axis=0)

with open("pkkh_seq", "w") as f:
    f.write(str(pk) + str(kh))

outP = open(fold_path_fid[0] + "/Pks8sqRatio_ist_LogSplineInterpPk.dat", "w")
Indeed, I would like to remove the whole last block, i.e. the last len(pk)-1 rows (each composed of 2 columns).
Is the syntax correct here to do that?
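For what it's worth, np.delete with a single index and axis=0 removes exactly one row, not a block; a quick self-contained check:

import numpy as np

a = np.arange(12).reshape(4, 3)   # 4 rows, 3 columns
b = np.delete(a, len(a) - 1, axis=0)
print(a.shape, b.shape)           # (4, 3) (3, 3): only the last row is gone

# To drop the last k rows instead, pass a sequence of indices (or just slice):
k = 2
c = np.delete(a, range(len(a) - k, len(a)), axis=0)  # equivalent to a[:-k]
print(c.shape)                    # (2, 3)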
I have a big dataframe with two million rows and 60,000 unique (store_id, product_id) pairs.
I need to select the rows for each (store_id, product_id) pair, do some calculations (such as resampling to hourly frequency, sums, averages), and finally concatenate everything into a new dataframe.
The problem is that it is very slow, and it gets slower while running.
The main code is:
def process_df(df, func, *args, **kwargs):
    product_ids = df.product_id.unique()
    store_ids = df.store_id.unique()
    # uk = df.drop_duplicates(subset=['store_id', 'product_id'])
    # for idx, item in uk.iterrows():
    all_df = list()
    with tqdm(total=product_ids.shape[0] * store_ids.shape[0]) as t:
        for store_id in store_ids:
            sdf = df.loc[df['store_id'] == store_id]
            for product_id in product_ids:
                new_df = sdf.loc[sdf['product_id'] == product_id]
                if new_df.shape[0] < 14:
                    continue
                new_df = func(new_df, *args, **kwargs)
                new_df.loc[:, 'store_id'] = store_id
                new_df.loc[:, 'product_id'] = product_id
                all_df.append(new_df)
                t.update()
    all_df = pd.concat(all_df)
    return all_df
def process_order_items(df, store_id=None, product_id=None, freq='D'):
    if store_id and "store_id" in df.columns:
        df = df.loc[df['store_id'] == store_id]
    if product_id and "product_id" in df.columns:
        df = df.loc[df['product_id'] == product_id]
    # convert to datetime
    df.loc[:, "datetime_create"] = pd.to_datetime(df.time_create, unit='ms').dt.tz_localize('UTC').dt.tz_convert('Asia/Shanghai').dt.tz_localize(None)
    df = df[["price", "count", "fee_total", "fee_real", "price_real", "price_guide", "price_change_category", "datetime_create"]]
    df.loc[:, "has_discount"] = (df.price_change_category > 0).astype(int)
    df.loc[:, "clearance"] = df.price_change_category.apply(lambda x: x in (10, 20, 23)).astype(int)
    if not freq:
        df.loc[:, "date_create"] = df["datetime_create"]
    else:
        assert freq in ('D', 'H')
        df.index = df.loc[:, "datetime_create"]
        discount_order_count = df['has_discount'].resample(freq).sum()
        clearance_order_count = df['clearance'].resample(freq).sum()
        discount_sale_count = df.loc[df.has_discount > 0, 'count'].resample(freq).sum()
        clearance_sale_count = df.loc[df.clearance > 0, 'count'].resample(freq).sum()
        no_discount_price = df.loc[df.has_discount == 0, 'price'].resample(freq).sum()
        no_clearance_price = df.loc[df.clearance == 0, 'price'].resample(freq).sum()
        order_count = df['count'].resample(freq).count()
        day_count = df['count'].resample(freq).sum()
        price_guide = df['price_guide'].resample(freq).max()
        price_avg = (df['price'] * df['count']).resample(freq).sum() / day_count
        df = pd.DataFrame({
            "price": price_avg,
            "price_guide": price_guide,
            "sale_count": day_count,
            "order_count": order_count,
            "discount_order_count": discount_order_count,
            "clearance_order_count": clearance_order_count,
            "discount_sale_count": discount_sale_count,
            "clearance_sale_count": clearance_sale_count,
        })
        df = df.drop(df[df.order_count == 0].index)
    return df
I think the problem is that there are too many redundant selections.
Maybe I could use groupby(['store_id', 'product_id']).agg to avoid the redundancy, but I have no idea how to use process_order_items with it and how to merge the results together.
I think you can change:
df.loc[:, "clearance"] = df.price_change_category.apply(lambda x: x in (10, 20, 23)).astype(int)
to Series.isin:
df["clearance"] = df.price_change_category.isin([10, 20, 23]).astype(int)
There is also a solution using Resampler.aggregate:
d = {'has_discount': 'sum',
     'clearance': 'sum',
     'count': ['count', 'sum'],
     'price_guide': 'max'}

df1 = df.resample(freq).agg(d)
df1.columns = df1.columns.map('_'.join)

# note: the flattened names are *_sum for has_discount and clearance,
# matching the 'sum' aggregations in the dict above
d1 = {'has_discount_sum': 'discount_order_count',
      'clearance_sum': 'clearance_order_count',
      'count_count': 'order_count',
      'count_sum': 'day_count',
      'price_guide_max': 'price_guide'}
df1 = df1.rename(columns=d1)  # rename returns a new frame, so assign it back
Another idea is to not convert the boolean masks to integers, but to use the columns directly for filtering, like:
df["has_discount"] = df.price_change_category > 0
df["clearance"] = df.price_change_category.isin([10, 20, 23])
discount_sale_count = df.loc[df.has_discount, 'count'].resample(freq).sum()
clearance_sale_count = df.loc[df.clearance, 'count'].resample(freq).sum()
#for filtering ==0 invert boolean mask columns by ~
no_discount_price = df.loc[~df.has_discount, 'price'].resample(freq).sum()
no_clearance_price = df.loc[~df.clearance, 'price'].resample(freq).sum()
The first function should be simplified with GroupBy.apply instead of the loops; then the concat is not necessary:
def f(x):
    print(x)

df = df.groupby(['product_id', 'store_id']).apply(f)
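Putting that together with the per-group function from the question, a hedged sketch of the whole pipeline (per_group is a hypothetical wrapper; it assumes process_order_items keeps its signature from the question) could look like:

def per_group(g, freq='H'):
    # skip sparse pairs, as the original loop did with shape[0] < 14
    if len(g) < 14:
        return None
    return process_order_items(g, freq=freq)

result = df.groupby(['store_id', 'product_id']).apply(per_group, freq='H')
# groupby.apply stitches the per-group frames back together, so the manual
# pd.concat is gone; the keys end up in the index instead of added columns.
result = result.reset_index(level=['store_id', 'product_id'])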
I have 2 tables on Excel:
[screenshot of the two tables omitted]
I've created an Excel pivot table using Python, but I could not find a simple way to create a calculated field inside it (like I would do with VBA) that matches Region from the left table with Region from the right table.
So I did this using the module win32com.client: first, I stored the contents of the tables in two lists, myTable and myRates.
Then I added a new column to the original left table where I calculated CA * (1 + rate). The code:
calField = [['CA Bonifié']]  # first element is the title of the new column

for a, testMyTable in enumerate(myTable):
    for b, testMyRates in enumerate(myRates):
        if a > 0 and b > 0:
            if testMyTable[0] == testMyRates[0]:
                calField.append([testMyTable[len(testMyTable) - 1] * (1 + testMyRates[1])])

for i, testDataRow in enumerate(calField):
    for j, testDataItem in enumerate(testDataRow):
        Sheet1.Cells(i + 1, len(testMyTable) + 1).Value = testDataItem
[screenshots of what it does in the sheet "Source" and in the created sheet "TCD" omitted]
The result is OK, but I don't like this method because it alters the original table. So I'm looking for a simpler way to do this.
Thanks in advance for your help
PS: The whole code is below, in case it helps.
import win32com.client

Excel = win32com.client.gencache.EnsureDispatch('Excel.Application')
win32c = win32com.client.constants
Excel.Visible = True
wb = Excel.Workbooks.Open('C:/Users/Documents/Python/classeur.xlsx')
Sheet1 = wb.Worksheets('Source')

def getContiguousRange(fichier, sheet, row, col):
    bottom = row
    while sheet.Cells(bottom + 1, col).Value not in [None, '']:
        bottom = bottom + 1
    right = col
    while sheet.Cells(row, right + 1).Value not in [None, '']:
        right = right + 1
    return sheet.Range(sheet.Cells(row, col), sheet.Cells(bottom, right)).Value

myTable = getContiguousRange(fichier=wb, sheet=Sheet1, row=1, col=1)
myRates = getContiguousRange(fichier=wb, sheet=Sheet1, row=1, col=8)

calField = [['CA Bonifié']]
for a, testMyTable in enumerate(myTable):
    for b, testMyRates in enumerate(myRates):
        if a > 0 and b > 0:
            if testMyTable[0] == testMyRates[0]:
                calField.append([testMyTable[len(testMyTable) - 1] * (1 + testMyRates[1])])

for i, testDataRow in enumerate(calField):
    for j, testDataItem in enumerate(testDataRow):
        Sheet1.Cells(i + 1, len(testMyTable) + 1).Value = testDataItem

cl1 = Sheet1.Cells(1, 1)
cl2 = Sheet1.Cells(len(myTable), len(myTable[0]) + 1)
pivotSourceRange = Sheet1.Range(cl1, cl2)
pivotSourceRange.Select()

Sheet2 = wb.Sheets.Add(After=wb.Sheets(1))
Sheet2.Name = 'TCD'
cl3 = Sheet2.Cells(4, 1)
pivotTargetRange = Sheet2.Range(cl3, cl3)
pivotTableName = 'tableauCroisé'

pivotCache = wb.PivotCaches().Create(SourceType=win32c.xlDatabase, SourceData=pivotSourceRange, Version=win32c.xlPivotTableVersion14)
pivotTable = pivotCache.CreatePivotTable(TableDestination=pivotTargetRange, TableName=pivotTableName, DefaultVersion=win32c.xlPivotTableVersion14)

pivotTable.PivotFields('Service').Orientation = win32c.xlRowField
pivotTable.PivotFields('Service').Position = 1
pivotTable.PivotFields('Region').Orientation = win32c.xlPageField
pivotTable.PivotFields('Region').Position = 1
pivotTable.PivotFields('Region').CurrentPage = 'IDF'

dataField = pivotTable.AddDataField(pivotTable.PivotFields('CA'))
dataField.NumberFormat = '# ### €'
calculField = pivotTable.AddDataField(pivotTable.PivotFields('CA Bonifié'))
calculField.NumberFormat = '# ### €'

# wb.SaveCopyAs('C:/Users/Documents/Python/tcd.xlsx')
# wb.Close(True)
# Excel.Application.Quit()
Note: I'm using Sheet1 since the image shows all the relevant indices and it's easier to verify.
You can move the formula to the pivot table in a later step, once verified.
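For the record, the Excel COM object model also exposes calculated fields directly on the pivot table; a hedged sketch follows (the field name and the flat 5% formula are illustrative, not from the question). Note that a pivot calculated field can only reference fields already in the pivot source, so it cannot look up the rate from the second table, which is why the VLOOKUP on the source data below is the way to go:

# Illustrative only: a calculated field built from existing source fields.
pivotTable.CalculatedFields().Add('CA majoré', '=CA*1.05')
pivotTable.PivotFields('CA majoré').Orientation = win32c.xlDataField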
STEP: Replace column E with the formula =VLOOKUP
Reference: how-to-use-vlookup-match
Replace the following in your code:

for row, testDataRow in enumerate(calField, 2):
    # Sheet1.Cells(i+1, len(testMyTable)+1).Value = testDataItem
    Sheet1.Cells(row, 5).Formula = '=VLOOKUP(A{}, H1:I5, MATCH(H1,H1:I1))'.format(row)
The result should show the matching Taux!
Come back and confirm the results are OK!
STEP: Compute Taux