How to display an apriori algorithm DataFrame in Flask - Python

I generate recommendations with the apriori algorithm and store its output in a DataFrame held in the Result variable. Result has three columns: Rule, Support and Confidence. It is built like this:
Result = pd.DataFrame(columns=['Rule', 'Support', 'Confidence'])
for idx, elem in enumerate(association_results):
    # print(elem)
    thiselem = elem
    # print("1 ", thiselem)
    nextelem = association_results[(idx + 1) % len(association_results)]
    # r1 = [x for x in thiselem[0]]
    # r2 = [x for x in nextelem[0]]
    # print("rule: ", r1[0], r2[0])
    # print("sup: ", elem[1])
    # print("conf: ", elem[2][0][2])
    Result = Result.append({
        'Rule': str([str(x) for x in thiselem[0]]) + " -> " + str([str(x) for x in nextelem[0]]),
        'Support': str(round(elem[1] * 100, 2)) + '%',
        'Confidence': str(round(elem[2][0][2] * 100, 2)) + '%'
    }, ignore_index=True)
I display the DataFrame in Flask; this is the route:
@app.route('/rekomendasi', methods=['POST'])
def rekomendasi():
    sup = request.form.get('support')
    conf = request.form.get('confidence')
    # Model
    store_data = pd.read_csv('dataPekerjaan.csv', sep=',', header=None, error_bad_lines=False)
    records = []
    # split the data into lists
    for i in range(store_data.shape[0]):
        records.append([str(store_data.values[i, j]).split(',') for j in range(store_data.shape[1])])
    # only take the job name data
    dataKerja = [[] for dataKerja in range(len(records))]
    for i in range(len(records)):
        for j in records[i][1]:
            dataKerja[i].append(j)
    # dataKerja
    min_sup = float('0.00' + str(sup))
    min_conf = float('0.00' + str(conf))
    association_rules = apriori(dataKerja, min_support=min_sup, min_confidence=min_conf, min_length=2)
    association_results = list(association_rules)
    # display the association results
    pd.set_option('max_colwidth', 200)
    result = pd.DataFrame(columns=['Rule', 'Support', 'Confidence'])
    for item in association_results:
        pair = item[2]
        for i in pair:
            result = result.append({
                'Rule': str([x for x in i[0]]) + " -> " + str([x for x in i[1]]),
                'Support': str(round(item[1] * 100, 2)) + '%',
                'Confidence': str(round(i[2] * 100, 2)) + '%'
            }, ignore_index=True)
    return render_template('rekomendasi.html', name='made', sup=sup, conf=conf, len=len(result) - 1, query=result)
and this is rekomendasi.html:
{% for i in range(len) %}
<tr>
    <th>{{ i+1 }}</th>
    <td>{{ query['Rule'][i] }}</td>
    <td>{{ query['Support'][i] }}</td>
    <td>{{ query['Confidence'][i] }}</td>
</tr>
{% endfor %}
I want it to look like this, without the [' '] around the items in the Rule column.
But when the Flask app is run, this is how it looks.
Is there any way to fix this?

You can use the join function to join the items in a list with a delimiter; here the delimiter is ", ". So your code becomes:
...
for item in association_results:
    pair = item[2]
    for i in pair:
        result = result.append({
            'Rule': ", ".join([x for x in i[0]]) + " -> " + ", ".join([x for x in i[1]]),
            'Support': str(round(item[1] * 100, 2)) + '%',
            'Confidence': str(round(i[2] * 100, 2)) + '%'
        }, ignore_index=True)
...
This removes the quotes and the brackets from the Rule column.
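As a side note, DataFrame.append is deprecated and removed in pandas 2.0, so a safer pattern is to collect the rows in a plain list and build the DataFrame once. A minimal sketch, assuming the same association_results structure as above and that the rule items are plain strings:

rows = []
for item in association_results:
    for i in item[2]:
        rows.append({
            'Rule': ", ".join(i[0]) + " -> " + ", ".join(i[1]),
            'Support': str(round(item[1] * 100, 2)) + '%',
            'Confidence': str(round(i[2] * 100, 2)) + '%',
        })
# build the DataFrame in one go instead of appending row by row
result = pd.DataFrame(rows, columns=['Rule', 'Support', 'Confidence'])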

Related

Rank the rows based on similar text using Python?

How do I rank the data frame based on the row value? I.e. I have a row that contains text data and I want to provide a rank based on similarity.
Expected output
I have tried with the Levenshtein distance but I am not sure how to do it for the whole table:
import string

import pandas as pd
import editdistance as ed  # assuming ed is the editdistance package (ed.eval)

def bow(x=None):
    x = x.lower()
    words = x.split(' ')
    words.sort()
    x = ' '.join(words)
    exclude = set('{}{}'.format(string.punctuation, string.digits))
    x = ''.join(ch for ch in x if ch not in exclude)
    x = '{} '.format(x.strip())
    return x

# intents = load_intents(export=True)
df['bow'] = df['name'].apply(lambda x: bow(x))
df.sort_values(by='bow', ascending=True, inplace=True)
last_bow = ''
recs = []
for idx, row in df.iterrows():
    record = {
        'name': row['name'],
        'bow': row['bow'],
        'lev_distance': ed.eval(last_bow, row['bow'])
    }
    recs.append(record)
    last_bow = row['bow']
intents = pd.DataFrame(recs, columns=['name', 'bow', 'lev_distance'])
l = intents[intents['lev_distance'] <= lev_distance_range]
r = []
for x in l.index.values:
    r.append(x - 1)
    r.append(x)
r = list(set(r))
l = intents.iloc[r, :]
Using textdistance, you could try this:
import pandas as pd
import textdistance

df = pd.DataFrame(
    {
        "text": [
            "Rahul dsa",
            "Rasul dsad",
            "Raul ascs",
            "shrez",
            "Indya",
            "Indi",
            "shez",
            "india",
            "kloa",
            "klsnsd",
        ],
    }
)

df = (
    df
    .assign(
        match=df["text"].map(
            lambda x: [
                i
                for i, text in enumerate(df["text"])
                if textdistance.jaro_winkler(x, text) >= 0.9
            ]
        )
    )
    .sort_values(by="match")
    .drop(columns="match")
)

print(df)
# Output
text
0 Rahul dsa
1 Rasul dsad
2 Raul ascs
3 shrez
6 shez
4 Indya
5 Indi
7 india
8 kloa
9 klsnsd
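If an explicit rank column is wanted rather than just sorting similar rows next to each other, the same similarity check can be factorized into group ids. This is only a sketch building on the textdistance answer above; the rank column name and the 0.9 threshold are illustrative choices, not something from the original post:

import pandas as pd
import textdistance

df = pd.DataFrame({"text": ["Rahul dsa", "Rasul dsad", "Raul ascs", "shrez", "shez"]})

# for each row, record the index of the first row it is similar to;
# rows that share the same "anchor" row belong to the same group
anchor = df["text"].map(
    lambda x: next(
        i for i, text in enumerate(df["text"])
        if textdistance.jaro_winkler(x, text) >= 0.9
    )
)
df["rank"] = pd.factorize(anchor)[0] + 1  # 1-based group rank
print(df)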

Why can I not print the results sorted by my new list?

I can't get it to sort by new_list (the total score) where I print the results. How can I make it sort by new_list? When I print new_list on its own it prints correctly.
new_list = []
for res in results:
    varv_1 = res[1]
    varv_2 = res[2]
    varv_3 = res[3]
    total = int(varv_1) + int(varv_2) + int(varv_3)
    new_list.append(total)
new_list.sort()
print(new_list)

# Total
print("\nResultat")
print("**********")
print("{:<8}{:<8}{:<8}{:<8}{:<10}{:<10}".format("Namn", "1", "2", "3", "Totalt", "Genomsnitt"))
for result in results:
    namn = result[0]
    varv_1 = result[1]
    varv_2 = result[2]
    varv_3 = result[3]
    total = int(varv_1) + int(varv_2) + int(varv_3)
    avrg = round(total / 3, 2)
    print(f"{namn:<8}{varv_1:<8}{varv_2:<8}{varv_3:<8}{total:<10}{avrg:<10}", end="")
    print("")
Make a list that contains both the scores and the totals, then sort it by the totals.
import operator

new_list = sorted(
    [(res[0], sum(map(int, res[1:4])), *map(int, res[1:4])) for res in results],
    key=operator.itemgetter(1),
)
for namn, total, varv_1, varv_2, varv_3 in new_list:
    avrg = round(total / 3, 2)
    print(f"{namn:<8}{varv_1:<8}{varv_2:<8}{varv_3:<8}{total:<10}{avrg:<10}")

How to iterate through two pandas columns and create a new column

I am trying to create a new column by concatenating two columns with certain conditions.
master['work_action'] = np.nan
for a, b in zip(master['repair_location'], master['work_service']):
    if a == 'Field':
        master['work_action'].append(a + " " + b)
    elif a == 'Depot':
        master['work_action'].append(a + " " + b)
    else:
        master['work_action'].append(a)
TypeError: cannot concatenate object of type '<class 'str'>'; only Series and DataFrame objs are valid
The problem is with master['work_action'].append(a + " " + b)
If I change my code to this:
test = []
for a, b in zip(master['repair_location'], master['work_service']):
    if a == 'Field':
        test.append(a + " " + b)
    elif a == 'Depot':
        test.append(a + " " + b)
    else:
        test.append(a)
I get exactly what I want in a list. But I want it in a pandas column. How do I create a new pandas column with the conditions above?
If performance is important, I would use numpy's select:
import numpy as np
import pandas as pd

master = pd.DataFrame(
    {
        'repair_location': ['Field', 'Depot', 'Other'],
        'work_service': [1, 2, 3]
    }
)

master['work_action'] = np.select(
    condlist=[
        master['repair_location'] == 'Field',
        master['repair_location'] == 'Depot'
    ],
    choicelist=[
        master['repair_location'] + ' ' + master['work_service'].astype(str),
        master['repair_location'] + ' ' + master['work_service'].astype(str)
    ],
    default=master['repair_location']
)
Which results in:
  repair_location  work_service work_action
0           Field             1     Field 1
1           Depot             2     Depot 2
2           Other             3       Other
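A side observation of my own, not part of the answer above: since both entries in choicelist are identical, the two conditions can be collapsed into one and np.where used instead. A minimal sketch on the same sample frame:

import numpy as np
import pandas as pd

master = pd.DataFrame({'repair_location': ['Field', 'Depot', 'Other'], 'work_service': [1, 2, 3]})

# concatenate only where the location is Field or Depot, otherwise keep the location as-is
master['work_action'] = np.where(
    master['repair_location'].isin(['Field', 'Depot']),
    master['repair_location'] + ' ' + master['work_service'].astype(str),
    master['repair_location'],
)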
The append method is for inserting values at the end of a DataFrame; you are trying to concatenate two string values. Use the apply method instead:
def fun(a, b):
    if a == 'Field':
        return a + " " + b
    elif a == 'Depot':
        return a + " " + b
    else:
        return a

master['work_action'] = master.apply(lambda x: fun(x['repair_location'], x['work_service']), axis=1)
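Another minimal option, sticking with the loop the question already has: since the test list is built in the same row order as the DataFrame, it can simply be assigned as a new column. A sketch under that assumption, with a str() cast added in case work_service is numeric as in the sample frame above:

test = []
for a, b in zip(master['repair_location'], master['work_service']):
    if a in ('Field', 'Depot'):
        test.append(a + " " + str(b))
    else:
        test.append(a)

# the list has one entry per row, in order, so it can become a column directly
master['work_action'] = test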

Difference between masking and querying pandas.DataFrame

My example shows that, when using a DataFrame of floats, querying might in certain cases be faster than using masks. When you look at the graph, the query function performs better when the condition is composed of 1 to 5 subconditions (struck out; see the correction below).
Edit (thanks to a_guest): the mask function performs better when the condition is composed of 1 to 5 subconditions.
So, is there any difference between the two methods, given that they tend to follow the same trend over the number of subconditions?
The function used to plot my data:
import matplotlib.pyplot as plt

def graph(data):
    t = [int(i) for i in range(1, len(data["mask"]) + 1)]
    plt.xlabel('Number of conditions')
    plt.ylabel('timeit (ms)')
    plt.title('Benchmark mask vs query')
    plt.grid(True)
    plt.plot(t, data["mask"], 'r', label="mask")
    plt.plot(t, data["query"], 'b', label="query")
    plt.xlim(1, len(data["mask"]))
    plt.legend()
    plt.show()
The functions used to create the conditions to be tested by timeit:
def create_multiple_conditions_mask(columns, nb_conditions, condition):
    mask_list = []
    for i in range(nb_conditions):
        mask_list.append("(df['" + columns[i] + "']" + " " + condition + ")")
    return " & ".join(mask_list)

def create_multiple_conditions_query(columns, nb_conditions, condition):
    mask_list = []
    for i in range(nb_conditions):
        mask_list.append(columns[i] + " " + condition)
    return "'" + " and ".join(mask_list) + "'"
The function used to benchmark masking vs querying on a pandas DataFrame containing floats:
import numpy as np
import pandas as pd
from timeit import timeit

def benchmarks_mask_vs_query(dim_df=(50, 10), labels=[], condition="> 0", random=False):
    # init local variables
    time_results = {"mask": [], "query": []}
    nb_samples, nb_columns = dim_df
    all_labels = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    if nb_columns > 26:
        if len(labels) == nb_columns:
            all_labels = labels
        else:
            raise Exception("labels length must match nb_columns")
    df = pd.DataFrame(np.random.randn(nb_samples, nb_columns), columns=all_labels[:nb_columns])
    for col in range(nb_columns):
        if random:
            condition = "<" + str(np.random.random(1)[0])
        mask = "df[" + create_multiple_conditions_mask(df.columns, col + 1, condition) + "]"
        query = "df.query(" + create_multiple_conditions_query(df.columns, col + 1, condition) + ")"
        print("Parameters: nb_conditions=" + str(col + 1) + ", condition= " + condition)
        print("Mask created: " + mask)
        print("Query created: " + query)
        print()
        result_mask = timeit(mask, number=100, globals=locals()) * 10
        result_query = timeit(query, number=100, globals=locals()) * 10
        time_results["mask"].append(result_mask)
        time_results["query"].append(result_query)
    return time_results
What I run:
# benchmark on a DataFrame of shape (50, 25) populated with random values,
# with the conditions ("< random_value") randomized as well
data = benchmarks_mask_vs_query((50,25), random=True)
graph(data)
What I get:

Convert a string into multiple JSON objects

I have the multi-line string below. For each line, I want to split the string and add the result to a JSON output file. I have done this using string.gettext().split and a regular expression; however, I am not sure this is the best way to do it.
Input file :
Server:prod01
Available memory: 20480 Disk:200 CPU:4
Used memory:12438 Disk:120 CPU:3
Unused memory:8042 Disk:80 CPU:1
Server:prod02
Available memory: 40960 Disk:500 CPU:8
Used memory:20888 Disk:320 CPU:3
Unused memory:20072 Disk:180 CPU:5
Expected output JSON:
{"prod01_available_memory":20480}
{"prod01_used_memory":12438}
{"prod01_unused_memory":8042}
{"prod01_available_disk":200}
{"prod01_used_disk":120}
{"prod01_unused_disk":80}
{"prod01_available_cpu":4}
{"prod01_used_cpu":3}
{"prod01_unused_cpu":1}
{"prod02_available_memory":40960}
{"prod02_used_memory":20888}
{"prod02_unused_memory":20072"}
{"prod02_available_disk":500"}
{"prod02_used_disk":380}
{"prod02_unused_disk":120}
{"prod02_available_cpu":8}
{"prod02_used_cpu":3}
{"prod02_unused_cpu":5}
Thanks,
Rinku
Below is my code -
def tsplit(string, *delimiters):
    pattern = '|'.join(map(re.escape, delimiters))
    return re.split(pattern, string)

prelist = pre.get_text().splitlines()
server_name = re.split('server|:', prelist[0])[2].strip()
if server_name == 'prod01':
    # print prelist[1]
    prod01_memory_actv = int(re.split('Activated memory|:|Disk|:|CPU|:', prelist[1])[2])
    prod01_Disk_actv = int(re.split('Activated memory|:|Disk|:|CPU|:', prelist[1])[4])
    prod01_CPU_actv = int(re.split('Activated memory|:|Disk|:|CPU|:', prelist[1])[6])
    # print prelist[2]
    prod01_memory_cons = int(re.split('memory consumed|:|Disk|:|CPU|:', prelist[2])[2])
    prod01_Disk_cons = int(re.split('memory consumed|:|Disk|:|CPU|:', prelist[2])[4])
    prod01_CPU_cons = int(re.split('memory consumed|:|Disk|:|CPU|:', prelist[2])[6])
    # print prelist[4]
    prod01_memory_unused = int(re.split('memory unused|:|Disk|:|CPU|:', prelist[4])[2])
    prod01_Disk_unused = int(re.split('memory unused|:|Disk|:|CPU|:', prelist[4])[4])
    prod01_CPU_unused = int(re.split('memory unused|:|Disk|:|CPU|:', prelist[4])[6])
elif server_name == 'prod02':
    # print prelist[1]
    prod02memory_actv = int(re.split('Activated memory|:|Disk|:|CPU|:', prelist[1])[2])
    prod02Disk_actv = int(re.split('Activated memory|:|Disk|:|CPU|:', prelist[1])[4])
    prod02CPU_actv = int(re.split('Activated memory|:|Disk|:|CPU|:', prelist[1])[6])
    # print prelist[2]
    prod02memory_cons = int(re.split('memory consumed|:|Disk|:|CPU|:', prelist[2])[2])
    prod02Disk_cons = int(re.split('memory consumed|:|Disk|:|CPU|:', prelist[2])[4])
    prod02CPU_cons = int(re.split('memory consumed|:|Disk|:|CPU|:', prelist[2])[6])
    # print prelist[4]
    prod02memory_unused = int(re.split('memory unused|:|Disk|:|CPU|:', prelist[4])[2])
    prod02Disk_unused = int(re.split('memory unused|:|Disk|:|CPU|:', prelist[4])[4])
    prod02CPU_unused = int(re.split('memory unused|:|Disk|:|CPU|:', prelist[4])[6])
else:
    # assign all variables 0
    .....

proc_item["logtime"] = str(t1)
proc_item["prod01_memory_actv"] = prod01_memory_actv
proc_item["prod01_Disk_actv"] = prod01_Disk_actv
proc_item["prod01_CPU_actv"] = prod01_CPU_actv
......
# for all other variables...
proc_data.append(proc_item)

with open("./proc_" + str(date.today()) + ".txt", 'a+') as f:
    json.dump(proc_data, f)
    f.write("\n")
I have only basic knowledge of Python.
Just using string splits and array indices:
hostmtrcs = "Server:prod01 Available memory:20480 Disk:200 CPU:4 Used memory:12438 Disk:120 CPU:3 Unused memory:8042 " \
"Disk:80 CPU:1 Server:prod02 Available memory: 40960 Disk:500 CPU:8 Used memory:20888 Disk:320 CPU:3 Unused " \
"memory:20072 Disk:180 CPU:5 "
datasplt = hostmtrcs.split(":")
hstname = ''
attrkey = ''
attrvalue = ''
for word in range(0, datasplt.__len__()):
if not datasplt[word].__contains__("Server"):
elmnt = datasplt[word].split(" ")
if datasplt[word].__contains__('prod'):
hstname = elmnt[0].lower()
if elmnt.__len__() == 3:
attrkey = elmnt[1].lower() + "_" + elmnt[2].lower() # attrkey
else:
attrkey = elmnt[1]
# retreive the value from the next element in the 1st attry datasplit
if word != datasplt.__len__() - 1:
nxtelmnt = datasplt[word + 1].split(" ")
attrvalue = nxtelmnt[0] # sattrvalue frm next element
finalfrmt = '{' + '"' +hstname + "_" + attrkey + '"' + ":" + attrvalue + '}'
print(finalfrmt)
I think you can do it with a dict and then just dump it as JSON. (In your case I don't think the expected output is strictly valid JSON, but as requested I have dumped the dict as JSON.) I haven't validated the keys; I am assuming the dictionary data comes in correctly.
import json

d = {'Server': 'prod01',
     'Available memory': 20480,
     'Disk': 200,
     'CPU': 4}

s = json.dumps({str(d['Server'] + "_" + key).replace(' ', '_'): value for key, value in d.items()})
print(json.loads(s))
>>> {'prod01_Server': 'prod01', 'prod01_Available_memory': 20480, 'prod01_Disk': 200, 'prod01_CPU': 4}
You should split the input text, section by section, according to what you're looking for.
data = '''Server:prod01
Available memory: 20480 Disk:200 CPU:4
Used memory:12438 Disk:120 CPU:3
Unused memory:8042 Disk:80 CPU:1
Server:prod02
Available memory: 40960 Disk:500 CPU:8
Used memory:20888 Disk:320 CPU:3
Unused memory:20072 Disk:180 CPU:5'''
import re
import json
print(json.dumps({
    '_'.join((s, l.split(' ', 1)[0], k)).lower(): int(v)
    for s, d in [i.split('\n', 1) for i in data.split('Server:') if i]
    for l in d.split('\n')
    for k, v in re.findall(r'(\w+):\s*(\d+)', l)
}))
This outputs:
{"prod01_available_memory": 20480, "prod01_available_disk": 200, "prod01_available_cpu": 4, "prod01_used_memory": 12438, "prod01_used_disk": 120, "prod01_used_cpu": 3, "prod01_unused_memory": 8042, "prod01_unused_disk": 80, "prod01_unused_cpu": 1, "prod02_available_memory": 40960, "prod02_available_disk": 500, "prod02_available_cpu": 8, "prod02_used_memory": 20888, "prod02_used_disk": 320, "prod02_used_cpu": 3, "prod02_unused_memory": 20072, "prod02_unused_disk": 180, "prod02_unused_cpu": 5}
