Iterating through lists with different lengths - python

I'm trying to iterate through 4 columns in a CSV that each contain a different number of sale IDs.
I make a pandas dataframe and convert each column to a list.
If a column has more sale IDs than the following column, it gives me this error:
Message: no such element: Unable to locate element: {"method":"xpath","selector":"/html/body/form[1]/div/select/option[@value=nan]"}
However, if all columns have the same number of IDs each, the code works fine.
def get_report_data(self):
    current_date = helpers.currentDate
    data = pd.read_csv(r'C:\Users\rford\Desktop\sale_ids.csv')
    everyone_ids = data['Everyone'].tolist()
    dd_ids = data['Daily Deal'].tolist()
    targeted_ids = data['Targeted'].tolist()
    push_ids = data['Push Notification'].tolist()
    acq_ids = data['Acquisition'].tolist()
    for form_code, sales_type, idlist in (
            ( 1, "Everyone", everyone_ids ),
            ( 1, "Daily Deal", dd_ids ),
            ( 2, "Targeted", targeted_ids ),
            ( 2, "Push Notification", push_ids ),
            ( 2, "Acquisition", acq_ids ) ):
        print('Gathering {} Sale Information'.format(sales_type))
        for sale_id in idlist:
            results = []
            helpers.WebDriverWait(helpers.driver, 10)
            helpers.driver.find_element_by_xpath('/html/body/form[{}]/div/select/option[@value={}]'.format(form_code, sale_id)).click()

The built-in function any might be useful in conjunction with each list's pop method:
def get_report_data(self):
    current_date = helpers.currentDate
    data = pd.read_csv(r'C:\Users\rford\Desktop\sale_ids.csv')
    everyone_ids = data['Everyone'].tolist()
    dd_ids = data['Daily Deal'].tolist()
    targeted_ids = data['Targeted'].tolist()
    push_ids = data['Push Notification'].tolist()
    acq_ids = data['Acquisition'].tolist()
    for form_code, sales_type, idlist in (
            ( 1, "Everyone", everyone_ids ),
            ( 1, "Daily Deal", dd_ids ),
            ( 2, "Targeted", targeted_ids ),
            ( 2, "Push Notification", push_ids ),
            ( 2, "Acquisition", acq_ids ) ):
        print('Gathering {} Sale Information'.format(sales_type))
        while any(idlist):
            results = []
            helpers.WebDriverWait(helpers.driver, 10)
            helpers.driver.find_element_by_xpath(
                '/html/body/form[{}]/div/select/option[@value={}]'.format(
                    form_code, idlist.pop(0)
                )
            ).click()

Turns out pandas was reading some cells of the CSV as floats: columns with empty cells get NaN values, which forces the whole column to a float dtype.
The fix ended up being to use .fillna(0) on the dataframe, then convert each column to integers with .astype(int) before turning it into a list:
df = pd.read_csv(r'C:\Users\rford\Desktop\sale_ids.csv')
data = df.fillna(0)
everyone_ids = data['Everyone'].astype(int).tolist()
dd_ids = data['Daily Deal'].astype(int).tolist()
targeted_ids = data['Targeted'].astype(int).tolist()
push_ids = data['Push Notification'].astype(int).tolist()
acq_ids = data['Acquisition'].astype(int).tolist()
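A caveat worth adding (my note, not part of the original fix): fillna(0) pads the shorter columns with zeros, so either skip those placeholder IDs inside the loops or drop the empty cells per column instead, assuming 0 is never a real sale ID. A minimal sketch of the second option, reusing the df read above:
# Sketch only: build each list straight from the raw frame, dropping the empty
# cells so shorter columns simply produce shorter lists with no padding.
everyone_ids = df['Everyone'].dropna().astype(int).tolist()
dd_ids = df['Daily Deal'].dropna().astype(int).tolist()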


Python dash return several values inside for loop

For my dash app, in order to update some graphs dynamically, I have to use a function that I named update_graphs inside a for loop. Some of the graphs contain several traces while some others only have one. The update_graphs function is called inside a callback and returns a dict and an int to update the extendData property of the graph object. However, since I am using a return statement inside a for loop, I only get the first trace.
I am not familiar with generators and the yield keyword; maybe this is an option, but I haven't been able to make it work.
I have also tried to store the results of update_graphs inside a list, but it is not working.
Any help is appreciated!
Here is the code for the app:
import dash
from dash.dependencies import Output, Input, State, MATCH, ALL
from dash import dcc, html, ctx
import plotly
import plotly.express as px
import random
import plotly.graph_objs as go
import pandas as pd

# Initializing the data with the correct format
init_store = {}
n = 3

init_df = pd.DataFrame({'a': pd.Series(dtype='int'), 'b': pd.Series(dtype='int'), 'c': pd.Series(dtype='int'), 'd': pd.Series(dtype='int')}, index=range(50))
init_df['a'] = init_df.index
init_store['0'] = init_df

for i in range(n):
    init_df = pd.DataFrame({'a': pd.Series(dtype='int'), 'b': pd.Series(dtype='int')}, index=range(50))
    init_df['a'] = init_df.index
    init_store[f'{i+1}'] = init_df

# Function to update the dataframes with the new observations
def get_data(json_data):
    df = pd.read_json(json_data)
    compteur = df['a'][len(df['a'])-1]
    if len(df.columns) > 2:
        new_row = {'a': compteur + 1, 'b': random.randint(13, 26), 'c': random.randint(13, 26), 'd': random.randint(13, 26)}
    else:
        new_row = {'a': compteur + 1, 'b': random.randint(13, 26)}
    df = df.shift(periods=-1)
    df.iloc[len(df)-1] = new_row
    return(df.to_json())

# Function to update the graphs based on the dataframes
def update_graphs(json_data, column, index=0):
    df = pd.read_json(json_data)
    nb_obs = df.shape[0]
    x_new = df['a'][len(df)-1]
    y_new = df[column][nb_obs-1]
    return dict(x=[[x_new]], y=[[y_new]]), index

colors = px.colors.qualitative.G10

def generate_graph_containers(index, json_data):
    dataframe = pd.read_json(json_data)
    X = dataframe['a']
    Y = dataframe.loc[:, dataframe.columns != 'a']
    graph_id = {'type': 'graph-', 'index': index}
    return(
        html.Div(
            html.Div(
                dcc.Graph(
                    id=graph_id,
                    style={"height": "8rem"},
                    config={
                        "staticPlot": False,
                        "editable": False,
                        "displayModeBar": False,
                    },
                    figure=go.Figure(
                        {
                            "data": [
                                {
                                    "x": list(X),
                                    "y": list(Y[Y.columns[i]]),
                                    "mode": "lines",
                                    "name": Y.columns[i],
                                    "line": {"color": colors[i+2]},
                                }
                                for i in range(len(Y.columns))
                            ],
                            "layout": {
                                "uirevision": True,
                                "margin": dict(l=0, r=0, t=4, b=4, pad=0),
                                "xaxis": dict(
                                    showline=False,
                                    showgrid=False,
                                    zeroline=False,
                                    showticklabels=False,
                                ),
                                "yaxis": dict(
                                    showline=False,
                                    showgrid=False,
                                    zeroline=False,
                                    showticklabels=False,
                                ),
                                "paper_bgcolor": "rgba(0,0,0,0)",
                                "plot_bgcolor": "rgba(0,0,0,0)",
                            }
                        }
                    )
                )
            )
        )
    )
app = dash.Dash(__name__)
store = [dcc.Store(id={'type':'store-', 'index':i}, data=init_store[str(i)].to_json()) for i in range(n)]
def make_layout():
    return(
        html.Div(
            [
                html.Div(
                    store
                ),
                dcc.Interval(
                    id='interval',
                    interval=1000,
                    n_intervals=0
                ),
                html.Div(
                    [
                        generate_graph_containers(str(i), store[i].data) for i in range(n)
                    ]
                )
            ]
        )
    )
app.layout = make_layout
@app.callback(
    Output(component_id={'type': 'store-', 'index': MATCH}, component_property='data'),
    [
        Input('interval', 'n_intervals'),
        State(component_id={'type': 'store-', 'index': MATCH}, component_property='data')
    ]
)
def update_data(time, data):
    return(get_data(data))

@app.callback(
    Output(component_id={'type': 'graph-', 'index': MATCH}, component_property='extendData'),
    Input(component_id={'type': 'store-', 'index': MATCH}, component_property="data")
)
def update_graphs_callback(data):
    triggered_id = ctx.triggered_id
    print(triggered_id['index'])
    columns = ['b', 'c', 'd']
    if triggered_id['index'] == 0:
        for i in range(len(columns)):
            return(update_graphs(data, columns[i], i))
    else:
        return(update_graphs(data, 'b'))

if __name__ == '__main__':
    app.run_server(debug=True)
I figured it out. The trick is in the format expected when updating the extendData property of a figure. When updating several traces, the value should be a dictionary with a key for the x values and one for the y values; each key holds an array containing one array per trace. Don't forget to add the trace indices after the dictionary. So, for example, in the case of 3 distinct traces, the function should return something like:
dict(x=[[x_0], [x_1], [x_2]], y=[[y_0], [y_1], [y_2]]), [0, 1, 2]
Therefore the update_graphs function should be:
def update_graphs(json_data):
    df = pd.read_json(json_data)
    nb_obs = df.shape[0]
    x_new = []
    y_new = []
    trace_index = []
    for i in range(len(df.columns)-1):
        x_new.append([df['a'][len(df)-1]])
        y_new.append([df[df.columns[i+1]][nb_obs-1]])
        trace_index.append(i)
    return(dict(x=x_new, y=y_new), trace_index)
And the callback to update the graphs should be changed to:
@app.callback(
    Output(component_id={'type': 'graph-', 'index': MATCH}, component_property='extendData'),
    Input(component_id={'type': 'store-', 'index': MATCH}, component_property="data")
)
def update_graphs_callback(data):
    return(update_graphs(data))

Iterrows replacement for a calculation between each row of one dataframe and another

I'm trying to move away from iterrows due to its poor performance. I can't, however, find another solution for comparing each row of one dataframe with each row of another dataframe.
I have two dataframes, each containing a latitude and a longitude. Previously I have used these functions to calculate the distance between the two coordinates, shown here:
def find_matches(first_HL, second_HL, N, M):
    program_start = time.time()
    matched_sites_df = pd.DataFrame()
    for i_WP, r_WP in first_HL.iterrows():
        series = pd.Series(dtype=float)
        if r_WP['PL Name'] is not None and r_WP['PL Latitude'] is not None and r_WP['PL Longitude'] is not None:
            series = name_and_distance_match(i_WP, r_WP, second_HL, N, M)
        if series is not None:
            series = pd.DataFrame(series.to_frame().T)
            matched_sites_df = pd.concat([matched_sites_df, series], axis=0, ignore_index=True)
            now = time.time()
            print("------ MATCH FOUND ------ ", r_WP['PL Name'], "------", round(now - program_start, 2), "seconds")
    return matched_sites_df
def calc_distance(r_WP, r_HL):
    coords_1 = (r_WP['PL Latitude'], r_WP['PL Longitude'])
    coords_2 = (r_HL['Latitude'], r_HL['Longitude'])
    distance_km = round(geopy.distance.geodesic(coords_1, coords_2).km, 2)
    return distance_km
def name_and_distance_match(i_WP, r_WP, second_HL, N, M):
    for i_HL, r_HL in second_HL.iterrows():
        if pd.isnull(r_HL['Site Name']) or pd.isnull(r_WP['PL Name']) == True:
            pass
        elif abs(r_WP['PL Latitude'] - r_HL['Latitude']) > 0.1:
            pass
        elif abs(r_WP['PL Longitude'] - r_HL['Longitude']) > 0.1:
            pass
        else:
            distance_km = r_WP['Distance (km)'] = calc_distance(r_WP, r_HL)
            if distance_km < M:
                r_HL = filter_town(r_WP, r_HL)
                score = r_WP['Name Similarity'] = np.vectorize(fuzzy)(r_HL["HL Site Short"], r_WP['PL Name'])
                if score > N:
                    r_WP["HL Site Short"] = r_HL["HL Site Short"]
                    return r_WP
Is there a way I can do this without iterrows?
The solution I'm working on at the moment looks like this:
def distance_check(first_HL, second_WPHL):
    first_lat = first_HL["Latitude"]
    first_long = second_WPHL["PL Longitude"]
    second_lat = first_HL["Latitude"]
    second_long = second_WPHL["PL Longitude"]
    if abs(first_lat - second_lat) + abs(first_long - second_long) > 0.2:
        return False
    else:
        COMBINED_HOUSELIST["WHATPUB Site Name"] = PUBMATCH_WHATPUB_SITES["Site Name"]
        return True

PUBMATCH_WHATPUB_SITES
COMBINED_HOUSELIST["Distance Check"] = COMBINED_HOUSELIST.apply(distance_check(PUBMATCH_WHATPUB_SITES, COMBINED_HOUSELIST), axis=1)
Any help would be greatly appreciated, thank you.
EDIT: Example Dataframes
COMBINED_HOUSELIST = pd.DataFrame(np.array([["12345", "Wrexham Cwtch", "52.10", "-2.06"], ["12354", "Horse & Hound", "52.21", "-1.95"], ["12435", "Round Of Gras Badsey", "52.33", "-1.99"]]),
                                  columns=['Site Number', 'Site Name', 'Longitude', 'Latitude'])
PUBMATCH_WHATPUB_SITES = pd.DataFrame(np.array([["52938", "Valkyrie Café Bar", "53.22", "-3.00"], ["12435", "Round Of Badsey", "52.33", "-1.99"], ["12345", "Cwtch", "52.11", "-2.00"]]),
                                      columns=['Site Number', 'Site Name', 'Longitude', 'Latitude'])
Desired output
matched_sites = pd.DataFrame(np.array([["12345", "Wrexham Cwtch", "52.10", "-2.06"], ["12354", "Horse & Hound", "52.21", "-1.95"], ["12435", "Round Of Gras Badsey", "52.33", "-1.99"]]),
                             columns=['Site Number', 'Site Name', 'Longitude', 'Latitude'])
One way or another, I fear that you will have to resort to some form of iteration, but doing it outside of Pandas might speed things up.
So, here is one way to do it with map and partial functions from the Python standard library.
First, define two helper functions:
from functools import partial

def calc_distance(coo1, coo2):
    return abs(coo1[0] - coo2[0]) + abs(coo1[1] - coo2[1])

def find_matches(one_list, another_list, threshold):
    idx = []
    for coo in one_list:
        func = partial(calc_distance, coo)
        results = [result for result in map(func, another_list)]
        idx.append([results.index(result) for result in results if result <= threshold])
    return idx
Then, with the following toy dataframes:
import pandas as pd
import numpy as np

COMBINED_HOUSELIST = pd.DataFrame(
    np.array(
        [
            ["12345", "Wrexham Cwtch", "52.10", "-2.06"],
            ["12354", "Horse & Hound", "52.21", "-1.95"],
            ["12435", "Round Of Gras Badsey", "52.33", "-1.99"],
        ]
    ),
    columns=["Site Number", "Site Name", "Longitude", "Latitude"],
)

PUBMATCH_WHATPUB_SITES = pd.DataFrame(
    np.array(
        [
            ["52938", "Valkyrie Café Bar", "53.22", "-3.00"],
            ["54999", "New Café Bar", "52.10", "-2.1"],
            ["12435", "Round Of Badsey", "52.33", "-1.99"],
            ["12345", "Cwtch", "52.11", "-2.00"],
        ]
    ),
    columns=["Site Number", "Site Name", "Longitude", "Latitude"],
)
You can proceed like this:
# Setup
for col in ["Latitude", "Longitude"]:
for df in [COMBINED_HOUSELIST, PUBMATCH_WHATPUB_SITES]:
df[col] = pd.to_numeric(df[col])
# Get two lists of coordinates looking like [[lat, long], [lat, long],...]
CH_COO = COMBINED_HOUSELIST.loc[:, ["Latitude", "Longitude"]].to_dict("split")["data"]
PW_COO = PUBMATCH_WHATPUB_SITES.loc[:, ["Latitude", "Longitude"]].to_dict("split")[
"data"
]
# Look for matches
COMBINED_HOUSELIST = COMBINED_HOUSELIST.assign(match=find_matches(CH_COO, PW_COO, 0.1))
# Get site names
COMBINED_HOUSELIST["match"] = COMBINED_HOUSELIST.apply(
lambda x: [PUBMATCH_WHATPUB_SITES.loc[idx, "Site Name"] for idx in x["match"]],
axis=1,
)
Finally, print(COMBINED_HOUSELIST):
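The output of that print is not included above; recomputing it from the code and toy data, it should come out roughly like this (my reconstruction, illustrative only):
  Site Number             Site Name  Longitude  Latitude                  match
0       12345         Wrexham Cwtch      52.10     -2.06  [New Café Bar, Cwtch]
1       12354         Horse & Hound      52.21     -1.95                     []
2       12435  Round Of Gras Badsey      52.33     -1.99      [Round Of Badsey]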

ValueError: ('Lengths must match to compare', (5854,), (0,))

Trying to create something based on the idea here: DataTable Interactivity.
However, I'm getting this error message:
ValueError: ('Lengths must match to compare', (5854,), (0,))
Below is the code I tried:
dbc.Col([
    html.P("Table:",
           style={"textDecoration": "underline"}),
    dbc.Col([
        html.Table([
            html.Td('Sub', id='', style=header_column_cell_style),    # title of the column
            html.Td('', id='subtotal', style=body_column_cell_style)  # data of the column
        ]),
        dash_table.DataTable(
            id='table',
            columns=[
                {'name': 'Today', "id": 'Date'},
                {'name': 'Product', "id": 'Product'},
                {'name': 'Sale', "id": 'Sale'},
            ],
            sort_action='native',  # "custom",
            sort_mode="multi",
            filter_action="native",
            row_selectable='multi',
            # selected_rows = []
            # data=df.to_dict('records')
        ),
    ])
]),
])
# table
@app.callback(
    Output('table', 'data'),
    Input('date_dd', 'value')
)
def update_table(selection):
    if len(selection) == 0:
        return dash.no_updates
    else:
        selection = datetime.strptime(selection, '%Y-%m-%d').date()
        dff = df[df['Date'] == selection]
        columns = dff[['Date', 'Product', 'Sale']]
        data = columns.to_dict('records')
        return data
@app.callback(
    Output('subtotal', 'children'),
    Input('table', 'derived_virtual_data'),
    Input('table', 'derived_virtual_selected_rows'),
    Input('date_dd', 'value')
)
def update_table(rows, derived_virtual_selected_rows, selection):
    if derived_virtual_selected_rows is None and len(selection) == 0:
        derived_virtual_selected_rows = []
    dff = df[df['Date'] == selection]
    dff1 = dff if rows is None else pd.DataFrame(rows)
    subt = dff1['Sale'].sum()
    return subt
Can anyone assist?
I also tried this:
dff = df[df['Date'].eq(selection)]
but I'm getting another error message:
ValueError: Lengths must be equal

How to serialize a complex query (peewee)

I am using peewee as my ORM and my goal is to serialize the result of a complex query which also contains subqueries:
machine_usage_alias = RecordDailyMachineUsage.alias()

subquery = (
    machine_usage_alias.select(
        machine_usage_alias.machine_id,
        fn.MAX(machine_usage_alias.date).alias('max_date'),
    )
    .group_by(machine_usage_alias.machine_id)
    .alias('machine_usage_subquery')
)

record_subquery = RecordDailyMachineUsage.select(
    RecordDailyMachineUsage.machine_id, RecordDailyMachineUsage.usage
).join(
    subquery,
    on=(
        (RecordDailyMachineUsage.machine_id == subquery.c.machine_id)
        & (RecordDailyMachineUsage.date == subquery.c.max_date)
    ),
)

query = (
    Machine.select(
        Machine.id,  # 0
        Machine.name,
        Machine.location,
        Machine.arch,
        Machine.platform,
        Machine.machine_version,
        Machine.status,
        record_subquery.c.usage.alias('usage'),
        fn.GROUP_CONCAT(Tag.name.distinct()).alias('tags_list'),
        fn.GROUP_CONCAT(Project.full_name.distinct()).alias('projects_list'),
    )  # 10
    .join(MachineTag)
    .join(Tag)
    .switch(Machine)
    .join(MachineProject)
    .join(Project)
    .join(
        record_subquery,
        JOIN.LEFT_OUTER,
        on=(Machine.id == record_subquery.c.machine_id),
    )
    .where((Machine.id != 0) & (Machine.is_alive == 1))
    .group_by(Machine.id)
)
I've tried to use the method model_to_dict:
jsonify({'rows': [model_to_dict(c) for c in query]})
But this way gives me the columns and values from the Machine model only. My aim is to include all the columns from the select query.
It turned out that I had to use the dicts method of the query and jsonify the result.
machine_usage_alias = RecordDailyMachineUsage.alias()

subquery = (
    machine_usage_alias.select(
        machine_usage_alias.machine_id,
        fn.MAX(machine_usage_alias.date).alias('max_date'),
    )
    .group_by(machine_usage_alias.machine_id)
    .alias('machine_usage_subquery')
)

record_subquery = RecordDailyMachineUsage.select(
    RecordDailyMachineUsage.machine_id, RecordDailyMachineUsage.usage
).join(
    subquery,
    on=(
        (RecordDailyMachineUsage.machine_id == subquery.c.machine_id)
        & (RecordDailyMachineUsage.date == subquery.c.max_date)
    ),
)

query = (
    Machine.select(
        Machine.id,  # 0
        Machine.name,
        Machine.location,
        Machine.arch,
        Machine.platform,
        Machine.machine_version,
        Machine.status,
        record_subquery.c.usage.alias('usage'),
        fn.GROUP_CONCAT(Tag.name.distinct()).alias('tags_list'),
        fn.GROUP_CONCAT(Project.full_name.distinct()).alias('projects_list'),
    )  # 10
    .join(MachineTag)
    .join(Tag)
    .switch(Machine)
    .join(MachineProject)
    .join(Project)
    .join(
        record_subquery,
        JOIN.LEFT_OUTER,
        on=(Machine.id == record_subquery.c.machine_id),
    )
    .where((Machine.id != 0) & (Machine.is_alive == 1))
    .group_by(Machine.id)
).dicts()
return jsonify({'rows': [c for c in query]})
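For reference (my note, not from the original post): with .dicts(), peewee returns each row as a plain dictionary keyed by the selected column names and aliases, so the serialized payload looks roughly like the sketch below (placeholder values):
{'rows': [
    {'id': 1, 'name': 'machine-01', 'location': '...', 'arch': '...',
     'platform': '...', 'machine_version': '...', 'status': '...',
     'usage': 42, 'tags_list': 'tag1,tag2', 'projects_list': 'proj/a,proj/b'}
]}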

Group by column to get array results in Postgresql

I have a table called moviegenre which looks like:
moviegenre:
- movie (FK movie.id)
- genre (FK genre.id)
I have a query (ORM generated) which returns all movie.imdb_id and genre.id pairs that have genre.id's in common with a given movie.imdb_id.
SELECT "movie"."imdb_id",
"moviegenre"."genre_id"
FROM "moviegenre"
INNER JOIN "movie"
ON ( "moviegenre"."movie_id" = "movie"."id" )
WHERE ( "movie"."imdb_id" IN (SELECT U0."imdb_id"
FROM "movie" U0
INNER JOIN "moviegenre" U1
ON ( U0."id" = U1."movie_id" )
WHERE ( U0."last_ingested_on" IS NOT NULL
AND NOT ( U0."imdb_id" IN
( 'tt0169547' ) )
AND NOT ( U0."imdb_id" IN
( 'tt0169547' ) )
AND U1."genre_id" IN ( 2, 10 ) ))
AND "moviegenre"."genre_id" IN ( 2, 10 ) )
The problem is that I'll get results in the format:
[
    ('imdbid22', 'genreid1'),
    ('imdbid22', 'genreid2'),
    ('imdbid44', 'genreid1'),
    ('imdbid55', 'genreid8'),
]
Is there a way, within the query itself, that I can group all of the genre ids into a list under each movie.imdb_id? I'd like to do the grouping in the query.
Currently I'm doing it in my web app code (Python), which is extremely slow when 50k+ rows are returned.
[
    ('imdbid22', ['genreid1', 'genreid2']),
    ('imdbid44', 'genreid1'),
    ('imdbid55', 'genreid8'),
]
thanks in advance!
Edit:
Here's the Python code which runs against the current results:
results_list = []
for item in movies_and_genres:
    genres_in_common = len(set([
        i['genre__id'] for i in movies_and_genres
        if i['movie__imdb_id'] == item['movie__imdb_id']
    ]))
    imdb_id = item['movie__imdb_id']
    if genres_in_common >= min_in_comon:
        result_item = {
            'movie.imdb_id': imdb_id,
            'count': genres_in_common
        }
        if result_item not in results_list:
            results_list.append(result_item)
return results_list
select m.imdb_id, array_agg(g.genre_id) as genre_id
from
    moviegenre g
    inner join
    movie m on g.movie_id = m.id
where
    m.last_ingested_on is not null
    and not m.imdb_id in ('tt0169547')
    and not m.imdb_id in ('tt0169547')
    and g.genre_id in (2, 10)
group by m.imdb_id
array_agg will create an array of all the genre_ids of a certain imdb_id:
http://www.postgresql.org/docs/current/interactive/functions-aggregate.html#FUNCTIONS-AGGREGATE-TABLE
I hope the Python code will be fast enough:
movielist = [
    ('imdbid22', 'genreid1'),
    ('imdbid22', 'genreid2'),
    ('imdbid44', 'genreid1'),
    ('imdbid55', 'genreid8'),
]
genres = {}
for items in movielist:
    if items[0] not in genres:
        genres[items[0]] = [items[1]]
    else:
        genres[items[0]].append(items[1])
print(genres)
Output:
{'imdbid44': ['genreid1'], 'imdbid55': ['genreid8'], 'imdbid22': ['genreid1', 'genreid2']}
If you just need the movie name and a count:
Change this in the original query and you will get the answer; you don't need the Python code:
SELECT "movie"."imdb_id", count("moviegenre"."genre_id")
group by "movie"."imdb_id"
