dask delayed loop with tuples - python

How can I properly use dask delayed for a group-wise quotient calculation over multiple columns?
Some sample data:
import pandas as pd

raw_data = {
    'subject_id': ['1', '2', '3', '4', '5'],
    'name': ['A', 'B', 'C', 'D', 'E'],
    'nationality': ['DE', 'AUT', 'US', 'US', 'US'],
    'alotdifferent': ['x', 'y', 'z', 'x', 'a'],
    'target': [0, 0, 0, 1, 1],
    'age_group': [1, 2, 1, 3, 1]}
df_a = pd.DataFrame(raw_data, columns=['subject_id', 'name', 'nationality', 'alotdifferent', 'target', 'age_group'])
df_a.nationality = df_a.nationality.astype('category')
df_a.alotdifferent = df_a.alotdifferent.astype('category')
df_a.name = df_a.name.astype('category')
Some setup code, which determines the string / categorical columns:
FACTOR_FIELDS = df_a.select_dtypes(include=['category']).columns
columnsToDrop = ['alotdifferent']
columnsToBias_keep = FACTOR_FIELDS[~FACTOR_FIELDS.isin(columnsToDrop)]
target = 'target'
The main part, the calculation of the group-wise quotients:
def compute_weights(da, colname):
    # group only a single time
    grouped = da.groupby([colname, target]).size()
    # calculate the first ratio
    df = grouped / da[target].sum()
    nameCol = "pre_" + colname
    grouped_res = df.reset_index(name=nameCol)
    grouped_res = grouped_res[grouped_res[target] == 1]
    grouped_res = grouped_res.drop(columns=target)
    # TODO: persist the result in a dict for the transformer
    result_1 = grouped_res
    return result_1, nameCol
And now actually calling it on multiple columns:
original = df_a.copy()
output_df = original
ratio_weights = {}
for colname in columnsToBias_keep.union(columnsToDrop):
    result_1, nameCol = compute_weights(original, colname)
    # persist the result in a dict for the transformer;
    # this is required to separate the fit and transform stages (later on, in a sklearn transformer)
    ratio_weights[nameCol] = result_1
When trying to use dask delayed, I need to call compute, which breaks the DAG. How can I circumvent this, in order to create a single big computational graph which is calculated in parallel?
from dask import delayed

compute_weights = delayed(compute_weights)
delayed_res_name = compute_weights(original, colname)
a, b = delayed_res_name.compute()  # calling compute per column breaks the graph apart
ratio_weights = {}
ratio_weights[b] = a
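One way to avoid the per-column compute() is to collect the lazy results and trigger a single evaluation at the end. A minimal sketch, assuming the compute_weights and column setup from above (nout=2 tells dask that the delayed call returns a 2-tuple, so it can be unpacked without computing):
import dask
from dask import delayed

lazy_pairs = []
for colname in columnsToBias_keep.union(columnsToDrop):
    # each call returns lazy (result, name) placeholders; nothing runs yet
    result, name = delayed(compute_weights, nout=2)(original, colname)
    lazy_pairs.append((name, result))

# a single compute() call evaluates the whole graph in parallel
(pairs,) = dask.compute(lazy_pairs)
ratio_weights = dict(pairs)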

Related

How To Assign Different Column Values To Different Variables In Python

I am trying to assign all three unique groups from the group column in df to different variables (see my code) using Python. How do I incorporate this inside a for loop? Obviously var + i does not work.
import pandas as pd

data = {
    'group': ['a', 'a', 'a', 'b', 'b', 'c', 'c'],
    'num': list(range(7))
}
df = pd.DataFrame(data)
unique_groups = df['group'].unique()

# How do I incorporate this logic inside a for loop??
var1 = df[df['group'] == unique_groups[0]]
var2 = df[df['group'] == unique_groups[1]]
var3 = df[df['group'] == unique_groups[2]]

# My approach:
for i in range(len(unique_groups)):
    var + i = df[df['group'] == unique_groups[i]]  # obviously "var + i" does not work
From your comment it seems it is okay for all_vars to be a list so that all_vars[0] is the first group, all_vars[1] the second, etc. In that case, consider using groupby instead:
all_vars = [group for name, group in df.groupby("group")]
You can do this using a dictionary, basically:
all_vars = {}
for i in range(len(unique_groups)):
    all_vars[f"var{i}"] = df[df['group'] == unique_groups[i]]

Function to replace values in columns with Column Headers (Pandas)

I am trying to create a function that loops through specific columns in a dataframe and replaces the values with the column names. I have tried the below, but it does not change the values in the columns.
def value_replacer(df):
    cols = ['Account Name', 'Account Number', 'Maintenance Contract']
    x = [i for i in df.columns if i not in cols]
    for i in x:
        for j in df[i]:
            if isinstance(j, str):
                j.replace(j, i)  # returns a new string, which is never written back to df
    return df
What should be added to the function to change the values?
Similar to @lazy's solution, but using difference to get the unlisted columns and using a mask instead of the list comprehension:
df = pd.DataFrame({'w': ['a', 'b', 'c'], 'x': ['d', 'e', 'f'], 'y': [1, 2, '3'], 'z': [4, 5, 6]})

def value_replacer(df):
    cols_to_skip = ['w', 'z']
    for col in df.columns.difference(cols_to_skip):
        # boolean mask of the rows whose value is a string
        mask = df[col].map(lambda x: isinstance(x, str))
        df.loc[mask, col] = col
    return df

Output of value_replacer(df):
   w  x  y  z
0  a  x  1  4
1  b  x  2  5
2  c  x  y  6
Loop through only the columns of interest once, and only evaluate each row within each column to see if it is a string or not, then use the resulting mask to bulk update all strings with the column name.
Note that this will change the dataframe in place, so make a copy if you want to keep the original, and you don't necessarily need the return statement.
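If the original frame must stay untouched, a non-mutating variant is straightforward. A sketch under the same assumptions (the cols_to_skip names are illustrative):
def value_replacer_copy(df, cols_to_skip=('w', 'z')):
    out = df.copy()  # leave the caller's frame untouched
    for col in out.columns.difference(list(cols_to_skip)):
        mask = out[col].map(lambda v: isinstance(v, str))
        out[col] = out[col].where(~mask, col)  # replace strings with the column name
    return out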

Find a column name and retaining certain string in that entire column values

I would like to format the "status" column in a csv and retain the string inside the single quotes that adjoin a comma ('sometext',).
Example:
Input (screenshot omitted): as in rows 2 and 3, if more than one value is found in a column's cell, the values should be concatenated with a pipe symbol (|), e.g. Phone|Charger.
The expected output (screenshot omitted) should be written back into the same status column.
My attempt (not working):
import re
import pandas as pd

df = pd.read_csv("test projects.csv")
scol = df.columns.get_loc("Status")
statusRegex = re.compile("'\t',?'\t',")
mo = statusRegex.search(scol.column)
Let's say you have df as:
df = pd.DataFrame([[[{'a':'1', 'b': '4'}]], [[{'a':'1', 'b': '2'}, {'a':'3', 'b': '5'}]]], columns=['pr'])

df:
                                              pr
0                        [{'a': '1', 'b': '4'}]
1  [{'a': '1', 'b': '2'}, {'a': '3', 'b': '5'}]

df['comb'] = df.pr.apply(lambda x: '|'.join([i['a'] for i in x]))

df:
                                              pr comb
0                        [{'a': '1', 'b': '4'}]    1
1  [{'a': '1', 'b': '2'}, {'a': '3', 'b': '5'}]  1|3
import pandas as pd

# simplified mock data
df = pd.DataFrame(dict(
    value=[23432] * 3,
    Status=[
        [{'product.type': 'Laptop'}],
        [{'product.type': 'Laptop'}, {'product.type': 'Charger'}],
        [{'product.type': 'TV'}, {'product.type': 'Remote'}]
    ]
))

# make a method to do the desired formatting / extraction of data
def da_piper(cell):
    """extracts product.type and concatenates with a pipe"""
    vals = [_['product.type'] for _ in cell]  # get only the product.type values
    return '|'.join(vals)  # join them with a pipe

# save to desired column
df['output'] = df['Status'].apply(da_piper)  # apply the method to the Status col
Additional note: you do not need read_excel, since a csv is not an Excel format; it is comma-separated values, a standard text format. In this case you can just do this:
import pandas as pd

# make a method to do the desired formatting / extraction of data
def da_piper(cell):
    """extracts product.type and concatenates with a pipe"""
    vals = [_['product.type'] for _ in cell]  # get only the product.type values
    return '|'.join(vals)  # join them with a pipe

# read csv to dataframe
df = pd.read_csv("test projects.csv")

# apply method and save to desired column
df['Status'] = df['Status'].apply(da_piper)  # apply the method to the Status col
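One caveat worth noting: pd.read_csv yields plain strings, so if the Status column holds string representations of lists of dicts (as the mock data above suggests), it needs to be parsed before da_piper can index into it. A sketch using ast.literal_eval from the standard library:
import ast

# turn the string "[{'product.type': 'Laptop'}]" into an actual list of dicts
df['Status'] = df['Status'].apply(ast.literal_eval)
df['Status'] = df['Status'].apply(da_piper)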
Thank you all for the help and suggestions. Please find the final working code:
import re
import pandas as pd

df = pd.read_csv('test projects.csv')
rows = len(df['input'])

def get_values(value):
    m = re.findall(r"'(.+?)'", value)
    word = ""
    for mm in m:
        if 'value' not in str(mm):
            if 'autolabel_strategy' not in str(mm):
                if 'String Matching' not in str(mm):
                    word += mm + "|"
    return str(word).rsplit('|', 1)[0]

al_lst = []
ans_lst = []
for r in range(rows):
    auto_label = df['autolabeledValues'][r]
    answers = df['answers'][r]
    al = get_values(auto_label)
    ans = get_values(answers)
    al_lst.append(al)
    ans_lst.append(ans)

df['a'] = al_lst
df['b'] = ans_lst
df.to_csv("Output.csv", index=False)

Recursive reference of a dataframe column to the row

Consider the following program, wherein I create a multi-index dataframe with three columns and populate one column with a nested list of a tuple of lists. I then flatten the indexes and try to iterate over the rows with ix, rec = next(df.iterrows()).
I then de-referenced the data column rec.data from the iterated row (rec) and found out it was a memory object <memory at 0x000000000D6E0AC8>. On calling the obj attribute on the record, rec.data.obj, I realised it is an array with the content of the entire row. To get to the actual content, I have to fetch the item by index, which is quite non-intuitive.
>>> print(rec.data.obj[2])
[(['9', '"', 'X', '12', '"'], 0.9993008259451988)]
Sample reproducible example:
import pandas as pd

def foo():
    return [(['9', '"', 'X', '12', '"'], 0.99930082594519876)]

def spam():
    index = pd.MultiIndex(levels=[[], []],
                          codes=[[], []],  # named 'labels' in pandas < 0.24
                          names=['timestamp', 'key'])
    columns = ['data', 'col1', 'col2']
    df = pd.DataFrame(index=index, columns=columns)
    for ix in range(4):
        key = ('XXX', ix)
        df.loc[key, 'data'] = str(foo())
        df.loc[key, 'col1'] = "col1_{}".format(ix)
        df.loc[key, 'col2'] = "col2_{}".format(ix)
    df.reset_index(inplace=True)
    return df
def bar():
    df = spam()
    ix, rec = next(df.iterrows())
    print(rec.data)
    print(rec.data.obj)
    print(rec.data.obj[2])

bar()
Output
<memory at 0x000000000D6E0AC8>
['XXX' 0 '[([\'9\', \'"\', \'X\', \'12\', \'"\'], 0.9993008259451988)]'
'col1_0' 'col2_0']
[(['9', '"', 'X', '12', '"'], 0.9993008259451988)]
I am clueless and cannot understand what I am missing.
It seems you need itertuples:
def bar():
    df = spam()
    rec = next(df.itertuples())
    print(rec)
    print(rec.data)

bar()
Pandas(Index=0, timestamp='XXX',
key=0,
data='[([\'9\', \'"\', \'X\', \'12\', \'"\'], 0.9993008259451988)]',
col1='col1_0',
col2='col2_0')
[(['9', '"', 'X', '12', '"'], 0.9993008259451988)]
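For completeness: the memoryview in the question most likely comes from a name collision. iterrows() yields each row as a Series, and older pandas versions exposed a Series.data attribute (a raw buffer over the underlying array), which shadows attribute-style access to a column named data. Bracket indexing avoids the collision; a sketch using the spam() frame above:
def bar():
    df = spam()
    ix, rec = next(df.iterrows())
    # rec['data'] looks up the 'data' label; rec.data hit the Series attribute
    print(rec['data'])

bar()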

convert lists of uniform dicts into pandas Dataframe with nested dicts as multi-index

At a bit of a loss despite much searching & experimentation...
Given this:
dictA = {'order': '1', 'char': {'glyph': 'A', 'case': 'upper', 'vowel': True}}
dictB = {'order': '2', 'char': {'glyph': 'B', 'case': 'upper', 'vowel': False}}
dictC = {'order': '3', 'char': {'glyph': 'C', 'case': 'upper', 'vowel': False}}
dictD = {'order': '4', 'char': {'glyph': 'd', 'case': 'lower', 'vowel': False}}
dictE = {'order': '5', 'char': {'glyph': 'e', 'case': 'lower', 'vowel': True}}
letters = [dictA, dictB, dictC, dictD, dictE]
how to turn letters into this (first column is the index):
  order   char
          glyph   case  vowel
0     1       A  upper   True
1     2       B  upper  False
2     3       C  upper  False
3     4       d  lower  False
4     5       e  lower   True
... and as a plus, then be able to operate on this frame to tally/plot the number of entries that are uppercase, the number that are vowels, etc.
Any ideas?
EDIT: My initial example was maybe too simple, but I'll leave it for posterity.
Given:
import re

class Glyph(dict):
    def __init__(self, glyph):
        super(Glyph, self).__init__()
        order = ord(glyph)
        self['glyph'] = glyph
        self['order'] = order
        kind = {'type': None}
        if re.search(r'\s+', glyph):
            kind = {'type': 'whitespace'}
        elif order in (list(range(ord('a'), ord('z') + 1)) +
                       list(range(ord('A'), ord('Z') + 1))):
            lowercase = glyph.lower()
            kind = {
                'type': lowercase,
                'vowel': lowercase in ['a', 'e', 'i', 'o', 'u'],
                'case': ['upper', 'lower'][lowercase == glyph],
                'number': (ord(lowercase) - ord('a') + 1)
            }
        self['kind'] = kind

chars = [Glyph(x) for x in 'Hello World']
I can do this:
import pandas as pd
df = pd.DataFrame(chars) # dataframe where 'order' & 'glyph' are OK...
# unpack 'kind' Series into list of dicts and use those to make a table
kindDf = pd.DataFrame(data=[x for x in df['kind']])
My intuition would lead me to think I could then do this:
df['kind'] = kindDf
...But that only adds the first column of my kindDf and puts it under 'kind' in df. Next attempt:
df.pop('kind') # get rid of this column of dicts
joined = df.join(kindDf) # flattens 'kind'...
joined is so close! The trouble is I want those columns from kind to be under a 'kind' hierarchy, rather than flat (as the joined result is). I've tried stack/unstack magic, but I can't grasp it. Do I need a MultiIndex?
This gets you close on the first part:
## a list for storing properly formatted dataframes
container = []
for l in letters:
    ## loop through the list of dicts, turn each into a dataframe,
    ## then add `order` to the index; unstack then makes the dataframe wide
    temp = pd.DataFrame(data=l).set_index('order', append=True).unstack(level=[0])
    container.append(temp)

## throw all the dataframes together into one
result = pd.concat(container).reset_index()
result
  order   char
           case  glyph  vowel
0     1   upper      A   True
1     2   upper      B  False
2     3   upper      C  False
3     4   lower      d  False
4     5   lower      e   True
For the second part, you can just rely on groupby and then the built-in plotting functions for quick visuals. Omit the plot call after size() if you just want to see the tally.
import matplotlib.pyplot as plt

result.groupby(result.char.vowel).size().plot(kind='bar', figsize=[8, 6])
plt.title('Glyphs are awesome')
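On recent pandas (1.0+), pd.json_normalize offers an arguably simpler route to the same MultiIndex layout; a sketch, assuming the letters list from the question:
import pandas as pd

# flatten the nested dicts into dotted column names: order, char.glyph, ...
flat = pd.json_normalize(letters)

# split the dotted names into a two-level column MultiIndex
flat.columns = pd.MultiIndex.from_tuples(
    [tuple(c.split('.')) if '.' in c else (c, '') for c in flat.columns]
)

# tallies then come essentially for free, e.g. counting vowels:
flat[('char', 'vowel')].sum()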
