Complete a pandas data frame with values from other data frames - python

I have 3 data frames. I need to enrich the data from df with the data columns from df2 and df3 so that df ends up with the columns 'Code', 'Quantity', 'Payment', 'Date', 'Name', 'Size', 'Product','product_id', 'Sector'.
The codes that are in df and not in df2 OR df3, need to receive "unknown" for the string columns and "0" for the numeric dtype columns
import pandas as pd

# Sales records to be enriched with product data from df2/df3.
data = {
    'Code': [356, 177, 395, 879, 952, 999],
    'Quantity': [20, 21, 19, 18, 15, 10],
    'Payment': [173.78, 253.79, 158.99, 400, 500, 500],
    'Date': ['2022-06-01', '2022-09-01', '2022-08-01', '2022-07-03',
             '2022-06-09', '2022-06-09'],
}
df = pd.DataFrame(data)
df['Date'] = pd.to_datetime(df['Date'])

# First lookup table: customer/product attributes keyed by Code.
data2 = {
    'Code': [356, 177, 395, 893, 697, 689, 687],
    'Name': ['John', 'Mary', 'Ann', 'Mike', 'Bill', 'Joana', 'Linda'],
    'Product': ['RRR', 'RRT', 'NGF', 'TRA', 'FRT', 'RTW', 'POU'],
    'product_id': [189, 188, 16, 36, 59, 75, 55],
    'Size': [1, 1, 3, 4, 5, 4, 7],
}
df2 = pd.DataFrame(data2)

# Second lookup table: overlaps df2 on several codes and adds 'Sector'.
data3 = {
    'Code': [879, 356, 389, 395, 893, 697, 689, 978],
    'Name': ['Mark', 'John', 'Marry', 'Ann', 'Mike', 'Bill', 'Joana', 'James'],
    'Product': ['TTT', 'RRR', 'RRT', 'NGF', 'TRA', 'FRT', 'RTW', 'DTS'],
    'product_id': [988, 189, 188, 16, 36, 59, 75, 66],
    'Sector': ['rt', 'dx', 'sx', 'da', 'sa', 'sd', 'ld', 'pc'],
}
df3 = pd.DataFrame(data3)
I was using the following code to obtain the unknown codes by comparing with df2, but now I also have to compare with df3, and add the data from the columns ['Name', 'Size', 'Product', 'product_id', 'Sector'].
# Inner merge keeps only the codes present in BOTH df2 and df.
common = df2.merge(df, on='Code')
# Codes in df that never matched df2: these need "unknown"/0 defaults later.
new_codes = df[~df['Code'].isin(common['Code'])]

Related

Populate a column ONLY if there is a value on another column

I am trying to get a specific value in one column only if the value on the other column is not blank. My current code returns the value for all the column regardless of if its blank or not.
Minimal reproducible example:
# Sample frame: col1 holds numeric values, col2 currency codes.
df = pd.DataFrame({
    'col1': [12, 34, 54, 23, 12, 43, 53, 12, 43, 12],
    'col2': ['USD', 'CAD', 'USD', 'USD', 'CAD', 'USD', 'USD',
             'EURO', 'USD', 'USD'],
})
Here's my code:
# Set date/time reply and empty column
def add_days_to_date(date, days):
    """Return `date` shifted forward by `days` days, formatted as 'MM-DD'.

    `date` may be anything pd.to_datetime accepts (str, date, Timestamp).
    """
    # pd.Timedelta avoids relying on `from datetime import timedelta`,
    # which the original snippet used without importing.
    shifted = pd.to_datetime(date) + pd.Timedelta(days=days)
    return shifted.strftime("%m-%d")
def replied_sent_date(date):
    """Build the 'Opened ... need update ...' status line for `date`."""
    follow_up = add_days_to_date(date, 1)
    return f"Opened {date} need update {follow_up}"
# Today's date rendered as MM-DD for the status message.
# pd.Timestamp avoids relying on an `import datetime` not shown here.
date = pd.Timestamp.today().date()
date_format = date.strftime('%m-%d')
# Start col3 blank, then fill only rows where col1 actually holds a value.
# (The original compared a numeric column to '' which is always True;
# notna() expresses the stated "only if not blank" intent.)
df['col3'] = " "
df.loc[df['col1'].notna(), 'col3'] = f"Opened {date_format} will need an update {add_days_to_date(date, 30)}"
Desired output:
df=pd.DataFrame(
{
'col1': [12, 34, 54, 23, 12, 43, 53, 12, 43, 12],
'col2': ['USD', 'CAD', 'USD', 'USD', 'CAD', 'USD', 'USD',
'EURO', 'USD', 'USD'],
'col3': [*Look at note below*]
}
)
** "col3" should contain f"Opened {date_format} will need an update {add_days_to_date(date, 30)}" for every cell in which there exists a value for col1.
Any help is appreciated, thanks.

Reorder strings within object based on another data frame lookup

This is my first data frame (df1).
I have to reorder the elements in sequence column (of my df1) based on the minimum count from the second data frame (df2)
so my end results should be like this
I think the code below would do what you want. I've commented inline and put links for further reading...
I know it could be compressed into shorter code, but I wanted the steps to be clear.
import pandas as pd
from pprint import pprint
# Sequences to reorder, keyed by id.
data1 = {'id': ['A1234', 'A2345'],
         'Sequence': ['16 31 17', '51 59 43']}
df1 = pd.DataFrame(data1)

# Lookup table; the label and count columns are assumed to be integers.
data2 = {'label': [10, 11, 12, 13, 16, 17, 21, 24, 31, 43, 44, 51, 59, 60],
         'count': [214, 128, 135, 37, 184, 68, 267, 264, 231, 13, 82, 100, 99, 92]}
df2 = pd.DataFrame(data2)
def seq_value_sort(seq_df, label_df):
    """Return a new frame with each 'Sequence' reordered by ascending count.

    For every row of `seq_df`, the space-separated integer labels in
    'Sequence' are looked up in `label_df` and re-emitted sorted by their
    'count' value. Labels absent from `label_df` are dropped (same as the
    original implementation).

    Parameters:
        seq_df:   DataFrame with 'id' and 'Sequence' (space-separated ints).
        label_df: DataFrame with integer 'label' and 'count' columns.

    Returns:
        New DataFrame with the same 'id' column and sorted 'Sequence' strings.
    """
    new_sequence_list = []
    for value in seq_df['Sequence'].values:
        # Convert the space-separated string to a set of ints for O(1) lookup.
        wanted = {int(tok) for tok in value.split()}
        # Collect (label, count) in label_df order so ties in 'count' keep
        # label_df's ordering (list.sort is stable).
        # NOTE: the original used label_df.T.iteritems(), which was removed
        # in pandas 2.0 and transposed the whole frame per row.
        selected = [(int(lab), int(cnt))
                    for lab, cnt in zip(label_df['label'], label_df['count'])
                    if int(lab) in wanted]
        selected.sort(key=lambda pair: pair[1])
        new_sequence_list.append(' '.join(str(lab) for lab, _ in selected))
    return pd.DataFrame({'id': list(seq_df['id'].values),
                         'Sequence': new_sequence_list})
# Build the reordered frame, then print a separator line and the result.
df3 = seq_value_sort(df1, df2)
print(f'{"":-<40}')
print(df3)
EDIT:
Forgot the output:
----------------------------------------
[{'count': 184, 'label': 16},
{'count': 68, 'label': 17},
{'count': 231, 'label': 31}]
[{'count': 68, 'label': 17},
{'count': 184, 'label': 16},
{'count': 231, 'label': 31}]
['17', '16', '31']
----------------------------------------
[{'count': 13, 'label': 43},
{'count': 100, 'label': 51},
{'count': 99, 'label': 59}]
[{'count': 13, 'label': 43},
{'count': 99, 'label': 59},
{'count': 100, 'label': 51}]
['43', '59', '51']
{'Sequence': ['17 16 31', '43 59 51'], 'id': ['A1234', 'A2345']}
----------------------------------------
id Sequence
0 A1234 17 16 31
1 A2345 43 59 51

How to combine two (or more) DataFrames with different lengths and indexes, providing an appropriate index for both in a single DataFrame

I have two dataframes (DF and DF2). Could anybody help me understand how I can combine these two dataframes and make them look like this third one (DF3)? I presented a simple example, but I need this to compile dataframes that include different samples (or observations). Occasionally, samples encompass the same group of variables, but in most cases the samples present different variables. Each column corresponds to one sample.
Any help is welcome!
DF -
# Base sample: five people with test scores.
raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    'age': [42, 52, 36, 24, 73],
    'preTestScore': [4, 24, 31, 2, 3],
    'postTestScore': [25, 94, 57, 62, 70],
}
df = pd.DataFrame(raw_data, columns=['first_name', 'last_name', 'age',
                                     'preTestScore', 'postTestScore'])
print(df)
DF2 -
# Lookup frame: civil status is known for only two people.
raw_data2 = {'first_name': ['Molly', 'Jake'],
             'civil_status': ['Single', 'Single']}
df2 = pd.DataFrame(raw_data2, columns=['first_name', 'civil_status'])
# Stray mojibake characters that followed this call were removed;
# they were a paste artifact and a syntax error.
print(df2)
DF3 -
# Desired result: df enriched with civil_status; missing entries are shown
# here as the literal string 'NaN' (placeholders, not real NaN values).
raw_data3 = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    'age': [42, 52, 36, 24, 73],
    'preTestScore': [4, 24, 31, 2, 3],
    'postTestScore': [25, 94, 57, 62, 70],
    'civil_status': ['NaN', 'Single', 'NaN', 'Single', 'NaN'],
}
df3 = pd.DataFrame(raw_data3, columns=['first_name', 'last_name', 'age',
                                       'preTestScore', 'postTestScore',
                                       'civil_status'])
print(df3)
join
df.set_index("first_name").join(df2.set_index("first_name"))
I applied the solution above in the real context, by using the code below:
# Read both sheets (data header is on row 8) and join them on 'Element'.
arquivo1 = pd.read_excel(f1, header=7, index_col=False)
arquivo2 = pd.read_excel(f2, header=7, index_col=False)
joined = arquivo1.set_index("Element").join(arquivo2.set_index("Element"))
It provided ValueError: columns overlap but no suffix specified: Index(['AN', 'series', ' Net', ' [wt.%]', ' [norm. wt.%]', '[norm. at.%]',
'Error in %'],
dtype='object')
The pictures below represent "arquivo1" and "arquivo2"
arquivo1
arquivo2
When I include the suffix 'Element' in the right and left, it actually join the both dataframe.
joined = arquivo1.set_index("Element").join(arquivo2.set_index("Element"), lsuffix='Element', rsuffix='Element')
But when a dataframe containing more variables (lines) is joined to the first, it simply delete the new variables. Anybody know how to fix it?

Nested Lists to a nested dictionary

This is my first post and I have some problems with my code.
I need to transform my objects list:
mylist=[
[length, 1322, width, 850, high, 620, type, sedan, e, 55, f, 44],
[length, 1400, width, 922, high, 650, type, truck, e, 85, f, 50]
]
Into a dictionary like this:
mydic = {
sedan : {length : 1322, width : 850, high : 620, type : sedan, e : 55, f : 44},
truck : {length : 1400, width : 922, high : 650, type : truck, e : 85, f : 50}
}
I do not know how to do it...
Thanks in Advance!
lista = [["length", 1322, "width", 850, "high", 620, "type", "sedan", "e", 55, "f", 44],
         ["length", 1400, "width", 922, "high", 650, "type", "truck", "e", 85, "f", 50]]
# Zip each row's even-indexed keys with its odd-indexed values: one dict per row.
b = [dict(zip(i[::2], i[1::2])) for i in lista]
# Re-key the records by their 'type' field to build the nested dict.
c = {i["type"]: i for i in b}
It is slightly inefficient if your list gets bigger, but it is a solution.
The output for c btw. is:
{'sedan': {'length': 1322, 'width': 850, 'high': 620, 'type': 'sedan', 'e': 55, 'f': 44}, 'truck': {'length': 1400, 'width': 922, 'high': 650, 'type': 'truck', 'e': 85, 'f': 50}}
I also took the liberty of turning your variables into strings — I assumed you forgot to quote them. :)
First we make the key/value pairs, then we turn those into dictionaries. Then you can use the type entry of those dictionaries to put them in the nested dictionary
def pairs(seq):
    """Yield consecutive (key, value) pairs from a flat sequence."""
    stream = iter(seq)
    # zip pulls from the same iterator twice, pairing items 0-1, 2-3, ...
    return zip(stream, stream)
mylist = [['length', 1322, 'width', 850, 'high', 620, 'type', 'sedan', 'e', 55, 'f', 44],
          ['length', 1400, 'width', 922, 'high', 650, 'type', 'truck', 'e', 85, 'f', 50]]
# Lazily turn each flat record into a dict of key/value pairs...
dicts = (dict(pairs(record)) for record in mylist)
# ...then key the records by their 'type' entry.
result = {d['type']: d for d in dicts}
print(result)
# {'sedan': {'length': 1322, 'width': 850, 'high': 620, 'type': 'sedan', 'e': 55, 'f': 44}, 'truck': {'length': 1400, 'width': 922, 'high': 650, 'type': 'truck', 'e': 85, 'f': 50}}

Pandas: How to avoid nested for loop

I have some code that compares actual data to target data, where the actual data lives in one DataFrame and the target in another. I need to look up the target, bring it into the df with the actual data, and then compare the two. In the simplified example below, I have a set of products and a set of locations all with unique targets.
I'm using a nested for loop to pull this off: looping through the products and then the locations. The problem is that my real life data is larger on all dimensions, and it takes up an inordinate amount of time to loop through everything.
I've looked at various SO articles and none (that I can find!) seem to be related to pandas and/or relevant for my problem. Does anyone have a good idea on how to vectorize this code?
import pandas as pd
import numpy as np
import time
# Demo data: 20 employees, 7 locations, 5 products.
employee_list = ['Joe', 'Bernie', 'Elizabeth', 'Kamala', 'Cory', 'Pete',
'Amy', 'Andrew', 'Beto', 'Jay', 'Kristen', 'Julian',
'Mike', 'John', 'Tulsi', 'Tim', 'Eric', 'Seth', 'Howard',
'Bill']
location_list = ['Denver', 'Boulder', 'Phoenix', 'Reno', 'Portland',
'Eugene', 'San Francisco']
product_list = ['Product1', 'Product2', 'Product3', 'Product4', 'Product5']
# One target per (location, product) pair.
tgt_data = {'Location' : location_list,
'Product1' : [600, 200, 750, 225, 450, 175, 900],
'Product2' : [300, 100, 350, 125, 200, 90, 450],
'Product3' : [700, 250, 950, 275, 600, 225, 1200],
'Product4' : [200, 100, 250, 75, 150, 75, 300],
'Product5' : [900, 300, 1000, 400, 600, 275, 1300]}
tgt_df = pd.DataFrame(data = tgt_data)
# Actuals are drawn with np.random and no seed, so values differ every run.
employee_data = {'Employee' : employee_list,
'Location' : ['Boulder', 'Denver', 'Portland', 'Denver',
'San Francisco', 'Phoenix', 'San Francisco',
'Eugene', 'San Francisco', 'Reno', 'Denver',
'Phoenix', 'Denver', 'Portland', 'Reno',
'Boulder', 'San Francisco', 'Phoenix',
'San Francisco', 'Phoenix'],
'Product1' : np.random.randint(1, 1000, 20),
'Product2' : np.random.randint(1, 700, 20),
'Product3' : np.random.randint(1, 1500, 20),
'Product4' : np.random.randint(1, 500, 20),
'Product5' : np.random.randint(1, 1500, 20)}
emp_df = pd.DataFrame(data = employee_data)
start = time.time()
# Nested loops: one scalar .loc assignment per (product, location) pair —
# the O(products x locations) pattern the question asks to vectorize.
for p in product_list:
    for l in location_list:
        # Copy the single target value into every matching employee row.
        emp_df.loc[emp_df['Location'] == l, p + '_tgt'] = (
            tgt_df.loc[tgt_df['Location']==l, p].values)
    # Actual-to-target ratio for this product.
    emp_df[p + '_pct'] = emp_df[p] / emp_df[p + '_tgt']
print(emp_df)
end = time.time()
print(end - start)
If the target dataframe is guaranteed to have unique locations, you can use a join to make this process really quick.
import pandas as pd
import numpy as np
import time
# Same demo data as the question, reproduced for a self-contained answer.
employee_list = ['Joe', 'Bernie', 'Elizabeth', 'Kamala', 'Cory', 'Pete',
'Amy', 'Andrew', 'Beto', 'Jay', 'Kristen', 'Julian',
'Mike', 'John', 'Tulsi', 'Tim', 'Eric', 'Seth', 'Howard',
'Bill']
location_list = ['Denver', 'Boulder', 'Phoenix', 'Reno', 'Portland',
'Eugene', 'San Francisco']
product_list = ['Product1', 'Product2', 'Product3', 'Product4', 'Product5']
# One target per (location, product) pair.
tgt_data = {'Location' : location_list,
'Product1' : [600, 200, 750, 225, 450, 175, 900],
'Product2' : [300, 100, 350, 125, 200, 90, 450],
'Product3' : [700, 250, 950, 275, 600, 225, 1200],
'Product4' : [200, 100, 250, 75, 150, 75, 300],
'Product5' : [900, 300, 1000, 400, 600, 275, 1300]}
tgt_df = pd.DataFrame(data = tgt_data)
# Actuals are unseeded np.random draws — different on every run.
employee_data = {'Employee' : employee_list,
'Location' : ['Boulder', 'Denver', 'Portland', 'Denver',
'San Francisco', 'Phoenix', 'San Francisco',
'Eugene', 'San Francisco', 'Reno', 'Denver',
'Phoenix', 'Denver', 'Portland', 'Reno',
'Boulder', 'San Francisco', 'Phoenix',
'San Francisco', 'Phoenix'],
'Product1' : np.random.randint(1, 1000, 20),
'Product2' : np.random.randint(1, 700, 20),
'Product3' : np.random.randint(1, 1500, 20),
'Product4' : np.random.randint(1, 500, 20),
'Product5' : np.random.randint(1, 1500, 20)}
emp_df = pd.DataFrame(data = employee_data)
With the setup done, we can now use our join.
# Column-name lists for the joined targets and the computed percentages.
product_tgt_cols = [product+'_tgt' for product in product_list]
print(product_tgt_cols) #['Product1_tgt', 'Product2_tgt', 'Product3_tgt', 'Product4_tgt', 'Product5_tgt']
product_pct_cols = [product+'_pct' for product in product_list]
print(product_pct_cols) #['Product1_pct', 'Product2_pct', 'Product3_pct', 'Product4_pct', 'Product5_pct']
start = time.time()
#join on location to get _tgt columns
# (rsuffix='_tgt' renames tgt_df's overlapping product columns on the way in)
emp_df = emp_df.join(tgt_df.set_index('Location'), on='Location', rsuffix='_tgt')
#divide the entire product arrays using numpy, store in temp
temp = emp_df[product_list].values/emp_df[product_tgt_cols].values
#create a new temp df for the _pct results, and assign back to emp_df
emp_df = emp_df.assign(**pd.DataFrame(temp, columns = product_pct_cols))
print(emp_df)
end = time.time()
print("with join: ",end - start)
You are having "wide format" dataframes. I feel "long format" easier to manipulate.
# turn emp_df into long
# indexed by "Employee", "Location", and "Product"
# (stack() moves the product columns into the index; to_frame() leaves a
# single value column that pandas auto-names 0)
emp_df = (emp_df.set_index(['Employee', 'Location'])
          .stack().to_frame())
emp_df.head()
0
Employee Location
Joe Boulder Product1 238
Product2 135
Product3 873
Product4 153
Product5 373
# turn tmp_df into a long series
# indexed by "Location" and "Product"
tgt_df = tgt_df.set_index('Location').stack()
tgt_df.head()
# set target for employees by locations:
# NOTE(review): the apply returns the whole tgt_df series for every employee
# group and relies on pandas index alignment on (Location, Product) when
# assigning back — confirm it aligns on the intended levels.
emp_df['target'] = (emp_df.groupby('Employee')[0]
                    .apply(lambda x: tgt_df))
# percentage
emp_df['pct'] = emp_df[0]/emp_df['target']
# you can get the wide format back by
# emp_df = emp_df.unstack(level=2)
# which will give you a dataframe with
# multi-level index and multi-level column

Categories