How to Fix ''Level None not found' Error in pandas? - python

Facing following error when running code
Level None not found
pt = df.pivot_table(index = 'User Name',values = ['Threat Score', 'Score'],
aggfunc = {
'Threat Score': np.mean,
'Score' :[np.mean, lambda x: len(x.dropna())]
},
margins = True)
pt = pt.sort_values('Score', ascending = False)
I want to take the average value of Threat Score & Score, also count of the user name. Then sort by Threat Score high to low.

Its a bug in pandas this is a github link for the same. This error comes with with multiple aggregations per column and margins=True, it won't come if you choose flag margins = False. you can add them later if you want. That sure will work:
pt = df.pivot_table(index = 'User Name',values = ['Threat Score', 'Score'],
aggfunc = {
'Threat Score': np.mean,
'Score' :[np.mean, lambda x: len(x.dropna())]
},
margins = False)
pt = pt.sort_values('Score', ascending = False)
let me know if this works for you

pt = df.pivot_table(index = 'User Agent', values = ['Threat Score', 'Score','Source IP'] ,
aggfunc = {"Source IP" : 'count',
'Threat Score':np.mean,
'Score': np.mean})
pt = pt.sort_values('Threat Score', ascending = False)
new_cols = ['Avg_Score', 'Count', 'Avg_ThreatScore']
pt.columns = new_cols
pt.to_csv(Path3 + '\\AllUserAgent.csv')

Related

Iterows replacement for a calculation between each row of one dataframe to another

I'm trying to move away from iterows due to it's poor proformance. I can't however find another solution to comparing each row of one dataframe with each row from another dataframe.
I have two dataframes each containing a latitude and a longitude. Previously I have used these two functions to make a distance calculation between the two coordinates shown here:
def find_matches(first_HL, second_HL, N, M):
program_start = time.time()
matched_sites_df = pd.DataFrame()
for i_WP, r_WP in first_HL.iterrows():
series = pd.Series(dtype=float)
if r_WP['PL Name'] is not None and r_WP['PL Latitude'] is not None and r_WP['PL Longitude'] is not None:
series = name_and_distance_match(i_WP, r_WP, second_HL, N, M)
if series is not None:
series = pd.DataFrame(series.to_frame().T)
matched_sites_df = pd.concat([matched_sites_df, series], axis=0, ignore_index=True)
now = time.time()
print("------ MATCH FOUND ------ ", r_WP['PL Name'], "------", round(now - program_start, 2), "seconds")
return matched_sites_df
def calc_distance(r_WP, r_HL):
coords_1 = (r_WP['PL Latitude'], r_WP['PL Longitude'])
coords_2 = (r_HL['Latitude'], r_HL['Longitude'])
distance_km = round(geopy.distance.geodesic(coords_1, coords_2).km, 2)
return distance_km
def name_and_distance_match(i_WP, r_WP, second_HL, N, M):
for i_HL, r_HL in second_HL.iterrows():
if pd.isnull(r_HL['Site Name']) or pd.isnull(r_WP['PL Name']) == True:
pass
elif abs(r_WP['PL Latitude'] - r_HL['Latitude']) > 0.1:
pass
elif abs(r_WP['PL Longitude'] - r_HL['Longitude']) > 0.1:
pass
else:
distance_km = r_WP['Distance (km)'] = calc_distance(r_WP, r_HL)
if distance_km < M:
r_HL = filter_town(r_WP, r_HL)
score = r_WP['Name Similarity'] = np.vectorize(fuzzy)(r_HL["HL Site Short"], r_WP['PL Name'])
if score > N:
r_WP["HL Site Short"] = r_HL["HL Site Short"]
return r_WP
Is there a way I can do this without iterows?
The solution I'm working on at the moment looks like this:
def distance_check(first_HL, second_WPHL):
first_lat = first_HL["Latitude"]
first_long = second_WPHL["PL Longitude"]
second_lat = first_HL["Latitude"]
second_long = second_WPHL["PL Longitude"]
if abs(first_lat - second_lat) + abs(first_long - second_long) > 0.2:
return False
else:
COMBINED_HOUSELIST["WHATPUB Site Name"] = PUBMATCH_WHATPUB_SITES["Site Name"]
return True
PUBMATCH_WHATPUB_SITES
COMBINED_HOUSELIST["Distance Check"] = COMBINED_HOUSELIST.apply(distance_check(PUBMATCH_WHATPUB_SITES, COMBINED_HOUSELIST), axis=1)
Any help would be greatly appreciated, thank you.
EDIT: Example Dataframes
COMBINED_HOUSELIST = pd.DataFrame(np.array([["12345", "Wrexham Cwtch", "52.10", "-2.06"], ["12354", "Horse & Hound", "52.21", "-1.95"], ["12435", "Round Of Gras Badsey", "52.33", "-1.99"]]),
columns=['Site Number', 'Site Name', 'Longitude', 'Latitude'])
PUBMATCH_WHATPUB_SITES= pd.DataFrame(np.array([["52938", "Valkyrie Café Bar", "53.22", "-3.00"], ["12435", "Round Of Badsey", "52.33", "-1.99"], ["12345", "Cwtch", "52.11", "-2.00"]]),
columns=['Site Number', 'Site Name', 'Longitude', 'Latitude'])
Desired output
matched_sites = pd.DataFrame(np.array([["12345", "Wrexham Cwtch", "52.10", "-2.06"], ["12354", "Horse & Hound", "52.21", "-1.95"], ["12435", "Round Of Gras Badsey", "52.33", "-1.99"]]),
columns=['Site Number', 'Site Name', 'Longitude', 'Latitude'])
One way or another, I fear that you will have to resort to some form of iteration, but doing it outside of Pandas might speed things up.
So, here is one way to do it with map and partial functions from Python standard library.
First, define two helper functions:
from functools import partial
def calc_distance(coo1, coo2):
return abs(coo1[0] - coo2[0]) + abs(coo1[1] - coo2[1])
def find_matches(one_list, another_list, threshold):
idx = []
for coo in one_list:
func = partial(calc_distance, coo)
results = [result for result in map(func, another_list)]
idx.append([results.index(result) for result in results if result <= threshold])
return idx
Then, with the following toy dataframes:
import pandas as pd
import numpy as np
COMBINED_HOUSELIST = pd.DataFrame(
np.array(
[
["12345", "Wrexham Cwtch", "52.10", "-2.06"],
["12354", "Horse & Hound", "52.21", "-1.95"],
["12435", "Round Of Gras Badsey", "52.33", "-1.99"],
]
),
columns=["Site Number", "Site Name", "Longitude", "Latitude"],
)
PUBMATCH_WHATPUB_SITES = pd.DataFrame(
np.array(
[
["52938", "Valkyrie Café Bar", "53.22", "-3.00"],
["54999", "New Café Bar", "52.10", "-2.1"],
["12435", "Round Of Badsey", "52.33", "-1.99"],
["12345", "Cwtch", "52.11", "-2.00"],
]
),
columns=["Site Number", "Site Name", "Longitude", "Latitude"],
)
You can proceed like this:
# Setup
for col in ["Latitude", "Longitude"]:
for df in [COMBINED_HOUSELIST, PUBMATCH_WHATPUB_SITES]:
df[col] = pd.to_numeric(df[col])
# Get two lists of coordinates looking like [[lat, long], [lat, long],...]
CH_COO = COMBINED_HOUSELIST.loc[:, ["Latitude", "Longitude"]].to_dict("split")["data"]
PW_COO = PUBMATCH_WHATPUB_SITES.loc[:, ["Latitude", "Longitude"]].to_dict("split")[
"data"
]
# Look for matches
COMBINED_HOUSELIST = COMBINED_HOUSELIST.assign(match=find_matches(CH_COO, PW_COO, 0.1))
# Get site names
COMBINED_HOUSELIST["match"] = COMBINED_HOUSELIST.apply(
lambda x: [PUBMATCH_WHATPUB_SITES.loc[idx, "Site Name"] for idx in x["match"]],
axis=1,
)
Finally, print(COMBINED_HOUSELIST):

Pandas how to search one df for a certain date and return that data

I have two data frames and I am trying to search each row by date in the user.csv file and find the corresponding date in the Raven.csv file and then return the Price from the df1 and the date and amount from df2.
This is working but my Price is returning a value like this [[0.11465]], is there a way to remove these brackets or a better way to do this?
import pandas as pd
df1 = pd.read_csv('Raven.csv',)
df2 = pd.read_csv('User.csv')
df1 = df1.reset_index(drop=False)
df1.columns = ['index', 'Date', 'Price']
df2['Timestamp'] = pd.to_datetime(df2['Timestamp'], format="%Y-%m-%d %H:%M:%S").dt.date
df1['Date'] = pd.to_datetime(df1['Date'], format="%Y-%m-%d").dt.date
Looper = 0
Date = []
Price = []
amount = []
total_value = []
for x in df2['Timestamp']:
search = df2['Timestamp'].values[Looper]
Date.append(search)
price =(df1.loc[df1['Date'] == search,['index']] )
value = df1['Price'].values[price]
Price.append(value)
payout = df2['Amount'].values[Looper]
amount.append(payout)
payout_value = value * payout
total_value.append(payout_value)
Looper = Looper + 1
dict = {'Date': Date, 'Price': Price, 'Payout': amount, "Total Value": total_value}
df = pd.DataFrame(dict)
df.to_csv('out.csv')
You can do indexing to get the value:
value = [[0.11465]][0][0]
print(value)
You get:
0.11465
I hope this is what you need.

Add new column to DataFrame with same default value

I would like to add a name column based on the 'lNames' list. But my code is overwriting the whole column in the last iteration as follows:
import pandas as pd
def consulta_bc(codigo_bcb):
url = 'http://api.bcb.gov.br/dados/serie/bcdata.sgs.{}/dados?formato=json'.format(codigo_bcb)
df = pd.read_json(url)
df['data'] = pd.to_datetime(df['data'], dayfirst=True)
df.set_index('data', inplace=True)
return df
lCodigos = [12, 11, 1, 21619, 21623, 12466]
lNames = ['CDI', 'SELIC', 'USD', 'EUR', 'GPB', 'IMAB']
iter_len = len(lCodigos)
saida = pd.DataFrame()
for i in range(iter_len):
saida = saida.append(consulta_bc(lCodigos[i]))
saida['nome']= lNames[i]
saida.to_csv('Indice', sep=';', index=True)
saida
Any help will be fully appreciated
Change the for loop in this way:
for i in range(iter_len):
df = consulta_bc(lCodigos[i])
df['nome'] = lNames[i]
saida = saida.append(df)

Read Excel file without using Pandas and add new columns and print and output file

I am new to python coding and due to some issue, I need to reconfigure my code without pandas.
I am reading an Excel and extracting a few columns with filtered values. Then passing the one column value to a function to fetch the results. The result comes back in a complex dictionary format then I have to create a new column from the dictionary then join the two outputs (initial Excel file and complex dictionary) and print that back in the output file.
So my data is
Customer Customer Name Serial Number
1 XYZ 101011
2 XYZ 1020123
3 XYX 102344
Dictionary output
[{'cert': {'alternate_names': [],
'created_on': '2017-09-10T16:15:25.7599734Z',
'csr_used': False,
'error_details': '',
'revocation_date': None,
'revocation_status': None,
'serial_no': '101011',
'status': 'Expired',
'valid_to': '2020-09-09T23:59:59.0000000Z'},
'meta': {'api_application_biz_unit': '',
'api_client_nuid': '',
'asset_name': '',
'audience': 'External',
'automation_utility': '',
'delegate_owner': '',
'environment': 'Development',
'l2_group_email': None,
'l3_group_email': None,
'requestor_email': '',
'support_email': '',
'tech_delegate_email': None,
'tech_owner_email': None}}]
Desired output:
Customer Customer Name Serial Number Alternate_name Audience Environment
1 XYZ 101011 [] External Dev
My Code:
def create_excel(filename):
data = pd.read_excel(filename, usecols=[4,18,19,20,26,27,28])
data["Customer Name"].fillna("N/A",inplace= True)
df = data[data['Customer Name'].str.contains("XYZ",case = False)]
output = df['Serial Number'].apply(lambda x: fetch_by_ser_no(x))
df2 = pd.DataFrame(output)
df2.columns = ['Output']
df5 = pd.concat([df,df2],axis = 1)
df3 = pd.concat([pd.DataFrame(pd.json_normalize(x)) for x in df2['Output']],
ignore_index=False)
df3["Serial Number"] = df3.iloc[:,11]
df4 = pd.merge(left = df5, right = df3, how = 'left',
left_on = df5["Serial Number"].str.lower(),
right_on = df3["Serial Number"].str.lower())
df4.fillna("N/A",inplace = True)
df4["Status"] = df4.iloc[:,21].replace({"N/A":"Cust Not Found"},inplace = True)
df4["Status"] = df4.iloc[:,21]
df4["Serial Number"] = df4.iloc[:,4]
df4["Audience"] = df4.iloc[:,30]
df4["Environment"] = df4.iloc[:,33]
df4[["Customer","Customer Name","Serial Number","Common Name","Status",
"Environment","Audience"]].to_excel(r'Data.xlsx', index = False)
I want to remove the pandas dependency from the code. I am having a hard time figuring this out.

How to use pandas INPUT function to get a list of customers

I have created a code to get users of my platform based on 2 things:
choiceTitle: search for a specific word contained in the title of an Ad that users of my platform have looked at. For eg, the Ad is "We are offering free Gin" and I want to get the word 'Gin'
PrimaryTagPreviousChoice: the Ad has a "Food and Drink" tag
I can get those users who are interested in Gin and Food and Drink with:
(df2['choiceTitle'].str.contains("(?i)Gin")) & (df2['PrimaryTagPreviousChoice'].str.contains("(?i)Food and Drink"))
What I'd like to do is create a function with all my code inside (hence the sql query, the rename operation, the sort_values ​​operation etc....) and then use the INPUT function. So I'll just have to run my code, so that python will ask me 2 questions:
choiceTitle? ... Gin
PrimaryTagPreviousChoice? ...Food and Drink.
I enter the 2 options and it gives me the users interested in, let's say, Gin and Food and Drink.
How can I do it?
MY CODE:
df = pd.read_sql_query(""" select etc..... """, con)
df1 = pd.read_sql_query(""" select etc..... """, con)
df1['user_id'] = df1['user_id'].apply(str)
df2 = pd.merge(df, df1, left_on='user_id', right_on='user_id', how='left')
tag = df2[
(df2['choiceTitle'].str.contains("(?i)Gin")) &
(df2['PrimaryTagPreviousChoice'].str.contains("(?i)Food and Drink"))
]
dw = tag[['user', 'title', 'user_category', 'email', 'last_login',
'PrimaryTagPreviousChoice', 'choiceTitle'
]].drop_duplicates()
dw = dw.sort_values(['last_login'], ascending=[False])
dw = dw[dw.last_login > dt.datetime.now() - pd.to_timedelta("30day")]
dw = dw.rename({'user': 'user full name', 'title': 'user title'}
, axis='columns')
dw.drop_duplicates(subset ="Email",
keep = 'first', inplace = True)
Adding a function in Python is simple. Just use the def keyword to declare the function and put your existing code under it (indented). Put parameters in the parenthesis.
Here is the updated code:
def GetUsers (title, tag)
df = pd.read_sql_query(""" select etc..... """, con)
df1 = pd.read_sql_query(""" select etc..... """, con)
df1['user_id'] = df1['user_id'].apply(str)
df2 = pd.merge(df, df1, left_on='user_id', right_on='user_id', how='left')
tag = df2[
(df2['choiceTitle'].str.contains("(?i)" + title)) &
(df2['PrimaryTagPreviousChoice'].str.contains("(?i)" + tag))]
dw = tag[['user', 'title', 'user_category', 'email', 'last_login',
'PrimaryTagPreviousChoice', 'choiceTitle'
]].drop_duplicates()
dw = dw.sort_values(['last_login'], ascending=[False])
dw = dw[dw.last_login > dt.datetime.now() - pd.to_timedelta("30day")]
dw = dw.rename({'user': 'user full name', 'title': 'user title'}
, axis='columns')
dw.drop_duplicates(subset ="Email",
keep = 'first', inplace = True)
return dw # send back to print statement
# get input from user
inpTitle = input ("choiceTitle? ")
inpTag = input ("PrimaryTagPreviousChoice? ")
# run function
result = GetUsers (inpTitle, inpTag)
print(result)
Try this. Save your input() as variables and use string concatenation to edit your mask. Note that an additional set of {} is needed for escaping.
choiceTitle = input('choiceTitle?')
PrimaryTagPreviousChoice = input('PrimaryTagPreviousChoice?')
mask = df2[(df2['choiceTitle'].str.contains("(?i){{0}}".format(choiceTitle))) &
(df2['PrimaryTagPreviousChoice'].str.contains("(?i)
{{0}}".format(PrimaryTagPreviousChoice)))]
dw = mask[['user', 'title', 'user_category', 'email', 'last_login',
'PrimaryTagPreviousChoice', 'choiceTitle'
]].drop_duplicates()
....

Categories