Reorder strings within object based on another data frame lookup - python

This is my first data frame (df1).
I have to reorder the elements in sequence column (of my df1) based on the minimum count from the second data frame (df2)
so my end results should be like this

I think the code below would do what you want. I've commented inline and put links for further reading...
I know it could be compressed into shorter code, but I wanted the steps to be clear.
import pandas as pd
from pprint import pprint
data1 = {'id': ['A1234', 'A2345'],
'Sequence': ['16 31 17', '51 59 43']}
df1 = pd.DataFrame(data1)
# I assumed the label en count columns are integers
data2 = {'label': [10, 11, 12, 13, 16, 17, 21, 24, 31, 43, 44, 51, 59, 60],
'count': [214, 128, 135, 37, 184, 68, 267, 264, 231, 13, 82, 100, 99, 92]}
df2 = pd.DataFrame(data2)
def seq_value_sort(seq_df, label_df):
new_sequence_list = []
for value in seq_df['Sequence'].values:
print(f'{"":-<40}') # prints a line
# convert string to list of integers
# https://www.geeksforgeeks.org/python-converting-all-strings-in-list-to-integers/
sequence = [int(i) for i in value.split()]
# generate an unsorted list of dict items based on Sequence
data = []
for index, row in label_df.T.iteritems():
if int(row['label']) in sequence:
data.append({'label': int(row['label']),
'count': int(row['count'])})
pprint(data)
# now sort the unsorted list based on key 'count'
# https://stackoverflow.com/a/73050/9267296
data = sorted(data, key=lambda k: k['count'])
pprint(data)
# list comprehension to make list of strings out
# of the list of dict
# https://stackoverflow.com/a/7271523/9267296
sequence_sorted = [ str(item['label']) for item in data ]
pprint(sequence_sorted)
# create the final sequence string from the list
new_sequence_list.append(' '.join(sequence_sorted))
# create return data
return_data = {'id': list(seq_df['id'].values),
'Sequence': new_sequence_list}
pprint(return_data)
# finally return a new df
return pd.DataFrame(return_data)
df3 = seq_value_sort(df1, df2)
print(f'{"":-<40}')
print(df3)
EDIT:
Forgot the output:
----------------------------------------
[{'count': 184, 'label': 16},
{'count': 68, 'label': 17},
{'count': 231, 'label': 31}]
[{'count': 68, 'label': 17},
{'count': 184, 'label': 16},
{'count': 231, 'label': 31}]
['17', '16', '31']
----------------------------------------
[{'count': 13, 'label': 43},
{'count': 100, 'label': 51},
{'count': 99, 'label': 59}]
[{'count': 13, 'label': 43},
{'count': 99, 'label': 59},
{'count': 100, 'label': 51}]
['43', '59', '51']
{'Sequence': ['17 16 31', '43 59 51'], 'id': ['A1234', 'A2345']}
----------------------------------------
id Sequence
0 A1234 17 16 31
1 A2345 43 59 51

Related

Complete a pandas data frame with values from other data frames

I have 3 data frames. I need to enrich the data from df with the data columns from df2 and df3 so that df ends up with the columns 'Code', 'Quantity', 'Payment', 'Date', 'Name', 'Size', 'Product','product_id', 'Sector'.
The codes that are in df and not in df2 OR df3, need to receive "unknown" for the string columns and "0" for the numeric dtype columns
import pandas as pd
data = {'Code': [356, 177, 395, 879, 952, 999],
'Quantity': [20, 21, 19, 18, 15, 10],
'Payment': [173.78, 253.79, 158.99, 400, 500, 500],
'Date': ['2022-06-01', '2022-09-01','2022-08-01','2022-07-03', '2022-06-09', '2022-06-09']
}
df = pd.DataFrame(data)
df['Date']= pd.to_datetime(df['Date'])
data2 = {'Code': [356, 177, 395, 893, 697, 689, 687],
'Name': ['John', 'Mary', 'Ann', 'Mike', 'Bill', 'Joana', 'Linda'],
'Product': ['RRR', 'RRT', 'NGF', 'TRA', 'FRT', 'RTW', 'POU'],
'product_id': [189, 188, 16, 36, 59, 75, 55],
'Size': [1, 1, 3, 4, 5, 4, 7],
}
df2 = pd.DataFrame(data2)
data3 = {'Code': [879, 356, 389, 395, 893, 697, 689, 978],
'Name': ['Mark', 'John', 'Marry', 'Ann', 'Mike', 'Bill', 'Joana', 'James'],
'Product': ['TTT', 'RRR', 'RRT', 'NGF', 'TRA', 'FRT', 'RTW', 'DTS'],
'product_id': [988, 189, 188, 16, 36, 59, 75, 66],
'Sector': ['rt' , 'dx', 'sx', 'da', 'sa','sd','ld', 'pc'],
}
df3 = pd.DataFrame(data3)
I was using the following code to obtain the unknown codes by comparing with df2, but now i have to compare with df3 also and also add the data from the columns ['Name', 'Size', 'Product','product_id', 'Sector'].
common = df2.merge(df,on=['Code'])
new_codes = df[(~df['Code'].isin(common['Code']))]

appending tuple of values to list in kafka consumer data

I have a kafka consumer which consumes 5000 data per second
Im receving the producer data in this format
producer sent data ['UUTO9QJ', (14, 77, 'bike', "{'lat': 14, 'long': 77, 'elevation': 900,
'bearing': 67, 'device': 'bike', 'batt_temp': 99, 'batt_voltage': 11.98, 'type': 'v1',
'device_id': 'JHVJ5HGJD'}")]
I have folllowing code:
#kafkaBus.handle('topic')
def logic_of_data(msg):
# kafkaBus = kafkaBus.run()
print(f" Consumer recivied data {msg}")
# print(ConsumerRecord[7])
l = msg.value
print(l[0])# l[0] holds this values UUTO9QJ
val = tuple(l[1]) # holds this value (13, 77, 'bike', "{'lat': 13, 'long': 77, 'elevation':
# 900, 'bearing': 67, 'device': 'bike', 'batt_temp': 99, 'batt_voltage': 11.98, 'type': 'v1',
# 'device_id': 'JHVJ5HGJD'}")
print(val, type(val))
datalist.append(val)
print(datalist,len(datalist))
datalist = []
tablefields = ['device_id', 'latitude', 'longitude', 'jsonval']
if len(datalist) == 5000:
print("Threshold reached")
if val := checktableExists(cur, conn, l[0]):
print("started procrssing")
Insertval(cur, l[0], tablefields, datalist, conn)
del datalist[:]
else:
print("create new table")
createtable(cur, conn, l[0])
Insertval(cur, l[0], tablefields, datalist, conn)
del datalist[:]
the value till 3 records are appened are coming correctly as of statement print(datalist,len(datalist))
[(12, 77, 'bike', "{'lat': 12, 'long': 77, 'elevation': 900, 'bearing': 67, 'device':
'bike', 'batt_temp': 99, 'batt_voltage': 11.98, 'type': 'v1', 'device_id': 'JHVJ5HGJD'}"),
(13, 77, 'bike', "{'lat': 13, 'long': 77, 'elevation': 900, 'bearing': 67, 'device': 'bike',
'batt_temp': 99, 'batt_voltage': 11.98, 'type': 'v1', 'device_id': 'JHVJ5HGJD'}"), (14, 77,
'bike', "{'lat': 14, 'long': 77, 'elevation': 900, 'bearing': 67, 'device': 'bike',
'batt_temp': 99, 'batt_voltage': 11.98, 'type': 'v1', 'device_id': 'JHVJ5HGJD'}")]
3
After that the count becomes 1
[(15, 77, 'bike', "{'lat': 15, 'long': 77, 'elevation': 900, 'bearing': 67, 'device':
'bike', 'batt_temp': 99, 'batt_voltage': 11.98, 'type': 'v1', 'device_id': 'JHVJ5HGJD'}")] 1
I want something that gives list with 5000 tuples of value l[1] in it. How can it be done

Appending and adding elements inside list which contains dictionaries as elements

There is a list which contains dictionaries as its elements. Each dictionary as multiple keys. I need to add values of other keys on the base of given key values.
for example, There is a list A:
A =
[{'count': 100, 'price': [100, 200, 300], 'quality': 'good', 'key': 'veg'}, {'count': 150, 'price': [10, 20, 30], 'quality': 'good', 'key': 'non-veg'}, {'count': 200, 'price': [1, 2, 3], 'quality': 'good', 'key': 'veg'}, {'count': 100, 'price': [110, 220, 330], 'quality': 'good', 'key': 'non-veg'}]
I am trying to add the values of these elements on the base of 'key' value. I need the output like:
[{'count': 300, 'price': [100, 200, 300, 1, 2, 3], 'quality': 'good', 'key': 'veg'}, {'count': 250, 'price': [10, 20, 30, 110, 220, 330], 'quality': 'good', 'key': 'non-veg'}]
I tried using itertools functions groupby and map. But NOT able to get the result exactly as expected. is there any easy way of doing this?
As much as I love groupby, I don't think is a good idea here. Your elements alternate veg and non-veg, but groupby expects the groups to be contiguous, which means it's only going to work if you first call sorted, at which point you're throwing away all the simplicity and performance benefits of doing things iteratively.
Meanwhile, without sorting, it'll be a lot easier to build up a dict, keyed off the key values, than a list that you have to keep searching for each key. For example:
d = collections.defaultdict(lambda: dict(count=0, price=[], quality=''))
for entry in A:
key = entry['key']
target = d[key]
target['count'] += entry['count']
target['price'].extend(entry['price'])
target['quality'] = 'something' # I don't know what your aggregation rule is
Now, d looks like this:
defaultdict(<function __main__.<lambda>>,
{'non-veg': {'count': 250,
'price': [10, 20, 30, 110, 220, 330]],
'quality': 'something'},
'veg': {'count': 300,
'price': [100, 200, 300, 1, 2, 3]],
'quality': 'something'}})
And if you really need the list at the end, that's easy:
[dict(key=key, **value) for key, value in d.items()]
Alternatively, if the dict structure turns out to be more useful than the list, just use that. (Use dict.setdefault instead of a defaultdict, or do d = dict(d) at the end, if you don't want KeyErrors on later lookups to turn into default values, of course.)
For a pure Python solution, collections.defaultdict is likely the best option.
If you are willing to use a 3rd party library, this is possible via Pandas:
import pandas as pd
import numpy as np
# create dataframe from list of dictionaries
df = pd.DataFrame(A)
print(df)
count key price quality
0 100 veg [100, 200, 300] good
1 150 non-veg [10, 20, 30] good
2 200 veg [1, 2, 3] good
3 100 non-veg [110, 220, 330] good
# define aggregation rules
agg_dict = {'price': lambda x: np.array(x.values.tolist()).sum(axis=0).tolist(),
'count': np.sum,
'quality': 'first'}
# apply aggregation rules
res = df.groupby('key').agg(agg_dict).reset_index()
print(res)
key price count quality
0 non-veg [120, 240, 360] 250 good
1 veg [101, 202, 303] 300 good
Then to produce your dictionary result:
d = res.to_dict(orient='records')
print(d)
[{'key': 'non-veg', 'price': [120, 240, 360], 'count': 250, 'quality': 'good'},
{'key': 'veg', 'price': [101, 202, 303], 'count': 300, 'quality': 'good'}]

Order a dictionary based on specific values

Before writing a function, I would like to be sure there is no pre-built (optimized) solution (like sorted()) that can:
From a dictionary like this one :
tags = {'pinoyako':{'likes': 119, 'comments': 11, 'count': 1}, 'dii':{'likes': 151, 'comments': 3, 'count': 1},'djiphantom3':{'likes': 127, 'comments': 6, 'count': 1}}
Order the keys based on 'likes', 'comments' or 'count'. If it's based on 'likes', the output should be a list ordered :
output = [['dii',151],['djiphantom3',127],['pinoyako',119]]
Use a generator expression within sorted() function with a proper key function:
In [22]: from operator import itemgetter
In [23]: sorted(((k, v['likes']) for k, v in tags.items()), key=itemgetter(1), reverse=True)
Out[23]: [('dii', 151), ('djiphantom3', 127), ('pinoyako', 119)]
You need more entries to illustrate. Check this:
tags = {'pinoyako':{'likes': 119, 'comments': 11, 'count': 1},
'pinoyako2':{'likes': 120, 'comments': 5, 'count': 2},
'djiphantom32':{'likes': 1275, 'comments': 61, 'count': 15},
'dii2':{'likes': 151, 'comments': 33, 'count': 13},
'dii':{'likes': 151, 'comments': 3, 'count': 1},
'djiphantom3':{'likes': 1275, 'comments': 61, 'count': 1}
}
tags_sorted = sorted(tags.items(), key=lambda x: (x[1]['likes'], x[1]['comments'], x[1]['count']))
tags_sorted
Output:
[('pinoyako', {'comments': 11, 'count': 1, 'likes': 119}),
('pinoyako2', {'comments': 5, 'count': 2, 'likes': 120}),
('dii', {'comments': 3, 'count': 1, 'likes': 151}),
('dii2', {'comments': 33, 'count': 13, 'likes': 151}),
('djiphantom3', {'comments': 61, 'count': 1, 'likes': 1275}),
('djiphantom32', {'comments': 61, 'count': 15, 'likes': 1275})]
Then you can do this:
tags_sorted = [[k, v['likes']] for k,v in tags_sorted]
tags_sorted
Output:
[['pinoyako', 119],
['pinoyako2', 120],
['dii', 151],
['dii2', 151],
['djiphantom3', 1275],
['djiphantom32', 1275]]

Convert dataframe to dictionary in Python

I have a csv file that I converted into dataframe using Pandas. Here's the dataframe:
Customer ProductID Count
John 1 50
John 2 45
Mary 1 75
Mary 2 10
Mary 5 15
I need an output in the form of a dictionary that looks like this:
{ProductID:1, Count:{John:50, Mary:75}},
{ProductID:2, Count:{John:45, Mary:10}},
{ProductID:5, Count:{John:0, Mary:15}}
I read the following answers:
python pandas dataframe to dictionary
and
Convert dataframe to dictionary
This is the code that I'm having:
df = pd.read_csv('customer.csv')
dict1 = df.set_index('Customer').T.to_dict('dict')
dict2 = df.to_dict(orient='records')
and this is my current output:
dict1 = {'John': {'Count': 45, 'ProductID': 2}, 'Mary': {'Count': 15, 'ProductID': 5}}
dict2 = [{'Count': 50, 'Customer': 'John', 'ProductID': 1},
{'Count': 45, 'Customer': 'John', 'ProductID': 2},
{'Count': 75, 'Customer': 'Mary', 'ProductID': 1},
{'Count': 10, 'Customer': 'Mary', 'ProductID': 2},
{'Count': 15, 'Customer': 'Mary', 'ProductID': 5}]
IIUC you can use:
d = df.groupby('ProductID').apply(lambda x: dict(zip(x.Customer, x.Count)))
.reset_index(name='Count')
.to_dict(orient='records')
print (d)
[{'ProductID': 1, 'Count': {'John': 50, 'Mary': 75}},
{'ProductID': 2, 'Count': {'John': 45, 'Mary': 10}},
{'ProductID': 5, 'Count': {'Mary': 15}}]

Categories