How to structure a complex function to apply to a column of a pandas df? - python

I have a large (>500k rows) pandas df like so
orig_df = pd.DataFrame(columns=['id', 'free_text1', 'something_inert', 'free_text2'])
free_textX is a string field containing user input imported from a csv. The goal is to have a function func that does various checks on each row of free_textX and then performs Levenshtein fuzzy text recognition based on the contents of another df, reference. Something like
from rapidfuzz import process

LEVENSHTEIN_DIST = 25

def func(s) -> str:
    if s == "25":
        return s
    elif s == "nothing":
        return "something"
    else:
        s2 = process.extractOne(
            query=s,
            choices=reference['col_name'],
            score_cutoff=LEVENSHTEIN_DIST
        )
        return s2
After this process, a new column called recog_textX, containing the values returned by func, has to be inserted after free_textX.
I tried vectorization (for performance) like so
orig_df.insert(loc=new_col_index,  # calculated before
               column='recog_textX',
               value=func(orig_df['free_textX']))

def func(series) -> pd.core.series.Series:
    ...
but I don't understand how to structure func (handling an entire df column as a Series, as vectorization demands, right?), since process.extractOne(...) -> str handles single strs instead of a Series. Those interface concepts seem incompatible to me. But I do want to avoid a classic iteration here for performance reasons. My grasp of pandas is too shallow here. Help me out?

I may be missing a point, but you can use the apply function to get what I think you want:
orig_df['recog_textX'] = orig_df['free_textX'].apply(func)
This will create a new column 'recog_textX' by applying your function func to each element of the 'free_textX' column.
Let me know if I misunderstood your question.
As an aside, I do not think vectorizing this operation will make any difference speed-wise, given that each application of func() is a complicated string operation. But it does look nicer than just looping through the rows.
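For reference, a runnable sketch of this pattern, with stand-in data for reference and orig_df. One caveat: in current rapidfuzz versions, process.extractOne returns a (match, score, index) tuple, or None when nothing clears score_cutoff, so the sketch unpacks the match before returning it:

import pandas as pd
from rapidfuzz import process

LEVENSHTEIN_DIST = 25
reference = pd.DataFrame({'col_name': ['something', 'anything']})  # stand-in reference df

def func(s) -> str:
    if s == "25":
        return s
    elif s == "nothing":
        return "something"
    # extractOne returns a (match, score, index) tuple, or None below the cutoff
    result = process.extractOne(
        query=s,
        choices=reference['col_name'],
        score_cutoff=LEVENSHTEIN_DIST
    )
    return result[0] if result is not None else s

orig_df = pd.DataFrame({'id': [1, 2, 3],
                        'free_text1': ['25', 'nothing', 'sometheng']})
new_col_index = orig_df.columns.get_loc('free_text1') + 1  # insert right after free_text1
orig_df.insert(loc=new_col_index,
               column='recog_text1',
               value=orig_df['free_text1'].apply(func))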

Related

using if, elif, else logic in a function to define a dataframe column--why can't I use str.contains?

I want to create a dataframe column using a function with if/elif/else logic. A simplified example of my code is below. The main difference is that the real code has at least 20 different elif statements; hence, although I might be able to use nested np.where() or np.select(), I would prefer not to.
def func(row):
    if row['condition']==True:
        if row['summary'].str.contains('hi|there',case=False):
            return 'hi_there'
        else:
            return 'Other'
    else:
        if row['summary'].str.contains('goodbye|you',case=False):
            return 'goodbye_you'
        else:
            return 'Other'

df['newcolumn'] = df.apply(lambda row: func(row), axis=1)
I get this error message:
AttributeError: 'str' object has no attribute 'str'.
Is it possible to create my column using this method, but with a few additional tweaks? If it's not possible in Python, why?
The value stored in row at key 'summary' is a plain Python str, which has no attribute .str. Try getting rid of .str and see if it works.
You use .str to access vectorized string functions on a whole column. With apply you iterate through every row, so each value is already a plain string.
You can rewrite the function that you use within your apply. In this case you need something like this:
'hi' in row['summary'].lower()
or a regex as you did
import re
...
re.search('hi|there', row['summary'], re.IGNORECASE)
Better, I think, would be to actually use those vectorized functions as you started to, just not within the apply but on the dataframe columns directly:
df['new_column'] = 'Other'
df.loc[(df['condition']==True) & (df['summary'].str.contains('hi|there', case=False)), 'new_column'] = 'hi_there'
df.loc[(df['condition']==False) & (df['summary'].str.contains('goodbye|you', case=False)), 'new_column'] = 'goodbye_you'
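A tiny self-contained check of that vectorized version, with invented sample rows:

import pandas as pd

df = pd.DataFrame({'condition': [True, True, False, False],
                   'summary': ['Hi there', 'no match', 'Goodbye to you', 'no match']})

df['new_column'] = 'Other'
df.loc[(df['condition']==True) & (df['summary'].str.contains('hi|there', case=False)), 'new_column'] = 'hi_there'
df.loc[(df['condition']==False) & (df['summary'].str.contains('goodbye|you', case=False)), 'new_column'] = 'goodbye_you'

print(df['new_column'].tolist())  # ['hi_there', 'Other', 'goodbye_you', 'Other']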

Python: Running function to append values to an empty list returns no values

This is probably a very basic question but I haven't been able to figure this out.
I'm currently using the following to append values to an empty list
shoes = {'groups': ['running', 'walking']}
df_shoes_group_names = pd.DataFrame(shoes)

shoes_group_name = []
for type in df_shoes_group_names['groups']:
    shoes_group_name.append(type)

shoes_group_name
['running', 'walking']
I'm trying to accomplish the same with the loop inside a function; however, when I execute it, the list comes back blank:
shoes_group_name = []

def list_builder(dataframe_name):
    if 'shoes' in dataframe_name:
        for type in df_shoes_group_names['groups']:
            shoes_group_name.append(type)

list_builder(df_shoes_group_names)
shoes_group_name
[]
The reason for the function is that eventually I'll have multiple DFs with different products, so I'd like to just have if statements within the function to handle the creation of each list.
So, for example, future code could look like this:
df_shoes_group_names
df_boots_group_names
df_sandals_group_names
shoes_group_name = []
boots_group_name = []
sandals_group_name = []

def list_builder(dataframe_name):
    if 'shoes' in dataframe_name:
        for type in df_shoes_group_names['groups']:
            shoes_group_name.append(type)
    elif 'boots' in dataframe_name:
        for type in df_boots_group_names['groups']:
            boots_group_name.append(type)
    elif 'sandals' in dataframe_name:
        for type in df_sandals_group_names['groups']:
            sandals_group_name.append(type)

list_builder(df_shoes_group_names)
list_builder(df_boots_group_names)
list_builder(df_sandals_group_names)
Not sure if I'm approaching this the right way so any advice would be appreciated.
Best,
You should never call or search a variable name as if it were a string.
Instead, use a dictionary to store a variable number of variables.
Bad practice
# dataframes
df_shoes_group_names = pd.DataFrame(...)
df_boots_group_names = pd.DataFrame(...)
df_sandals_group_names = pd.DataFrame(...)
def foo(x):
    if 'shoes' in x:  # <-- THIS WILL NOT WORK
        # do something with x
Good practice
# dataframes
df_shoes_group_names = pd.DataFrame(...)
df_boots_group_names = pd.DataFrame(...)
df_sandals_group_names = pd.DataFrame(...)
dfs = {'shoes': df_shoes_group_names,
       'boots': df_boots_group_names,
       'sandals': df_sandals_group_names}

def foo(key):
    if 'shoes' in key:  # <-- THIS WILL WORK
        # do something with dfs[key]
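Building on that, the per-product lists from the question can then be produced in one pass over the dictionary; a small sketch with invented sample frames:

import pandas as pd

dfs = {'shoes': pd.DataFrame({'groups': ['running', 'walking']}),
       'boots': pd.DataFrame({'groups': ['hiking', 'work']}),
       'sandals': pd.DataFrame({'groups': ['beach']})}

# one list per product type, keyed by the same names as dfs
group_names = {key: list(df['groups']) for key, df in dfs.items()}

print(group_names['shoes'])  # ['running', 'walking']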

Error using map(): create new pandas column in respect to the value in another column

I am supposed to create a new pandas column by comparing the values of an existing column ('% Renewable') to the median of that same column; the result should make up the new column.
Of course I could use a for loop to do this. Though I am only at the beginning of my learning, I want to make more use of map, lambda, etc.
Therefore I tried this:
def above(x, y):
    if x >= y:
        return 1
    else:
        return 0

def answer_ten():
    Top15 = answer_one()  # loads the dataframe and formats it
    Median = Top15['% Renewable'].median()
    Top15['HighRenew'] = map(above, Top15['% Renewable'], Top15['% Renewable'].median())
    # one try: list(map(above, (Top15['% Renewable'], Top15['% Renewable'].median())))
    # one more try: [*map(above, (Top15['% Renewable'], Top15['% Renewable'].median()))]
    return Top15['HighRenew']
But instead of the values I get an error: 'float' object is not iterable.
I tried the two alternatives listed in the comment lines, which I got from another post here: Getting a map() to return a list in Python 3.x
By now I have figured out a different one-line solution like this:
Top15['HighRenew']=(Top15['% Renewable']>=Top15['% Renewable'].median()).astype('int')
But I would like to know how I could do this differently (of course more lengthy) with lambda, map() or filter().
Could anyone point me towards an alternative solution?
Thanks.
map takes a function followed by one or more sequences and applies the function across them element-wise. The error you get is because the second value you pass in, Top15['% Renewable'].median(), is a single float and cannot be looped over.
So you basically want something like this:
Top15['HighRenew'] = Top15.apply(lambda row: int(row['% Renewable'] >= Top15['% Renewable'].median()), axis=1)
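For the map/lambda route the question asked about, a minimal sketch with stand-in numbers: Series.map applies the function element-wise, so the median is bound once, outside the call, instead of being handed to map as a second iterable:

import pandas as pd

def above(x, y):
    if x >= y:
        return 1
    else:
        return 0

Top15 = pd.DataFrame({'% Renewable': [10.0, 50.0, 30.0]})  # stand-in data
Median = Top15['% Renewable'].median()

# the median is captured in the lambda; map only iterates over the column
Top15['HighRenew'] = Top15['% Renewable'].map(lambda x: above(x, Median))
print(Top15['HighRenew'].tolist())  # [0, 1, 1]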

Simplifying a list into categories

I am a new Python developer and was wondering if someone can help me with this. I have a dataset with one column that describes a company type. I noticed that the column has, for example, both surgical and surgery listed. It has eyewear, eyeglasses and optometry listed. So instead of having a huge list in this column, I want to simplify the category: if a description contains a word like "eye," "glasses" or "opto," then just change it to "eyewear." My initial code looks like this:
def map_company(row):
    company = row['SIC_Desc']
    if company in 'Surgical':
        return 'Surgical'
    elif company in ['Eye', 'glasses', 'opthal', 'spectacles', 'optometers']:
        return 'Eyewear'
    elif company in ['Cotton', 'Bandages', 'gauze', 'tape']:
        return 'First Aid'
    elif company in ['Dental', 'Denture']:
        return 'Dental'
    elif company in ['Wheelchairs', 'Walkers', 'braces', 'crutches', 'ortho']:
        return 'Mobility equipments'
    else:
        return 'Other'

df['SIC_Desc'] = df.apply(map_company, axis=1)
This is not correct though because it is changing every item into "Other," so clearly my syntax is wrong. Can someone please help me simplify this column that I am trying to relabel?
Thank you
It is hard to answer without the exact content of your data set, but I can see one mistake. According to your description, you are looking at this the wrong way around: you want one of the words to be in your company description, so it should look like this:
if any(test in company for test in ['Eye', 'glasses', 'opthal', 'spectacles', 'optometers'])
However, you might have a case issue here, so I would recommend:

company = row['SIC_Desc'].lower()
if any(test.lower() in company for test in ['Eye', 'glasses', 'opthal', 'spectacles', 'optometers']):
    return 'Eyewear'
You will also need to make sure company is a string and 'SIC_Desc' is a correct column name.
In the end your function will look like this:

def is_match(company, names):
    return any(name in company for name in names)

def map_company(row):
    company = row['SIC_Desc'].lower()
    if 'surgical' in company:
        return 'Surgical'
    elif is_match(company, ['eye', 'glasses', 'opthal', 'spectacles', 'optometers']):
        return 'Eyewear'
    elif is_match(company, ['cotton', 'bandages', 'gauze', 'tape']):
        return 'First Aid'
    else:
        return 'Other'
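A quick usage check of the function above, with made-up rows (note the lowercase substring matching):

import pandas as pd

df = pd.DataFrame({'SIC_Desc': ['Surgical supplies', 'Eyeglasses retail',
                                'Gauze and tape', 'Shoe repair']})
df['SIC_Desc'] = df.apply(map_company, axis=1)
print(df['SIC_Desc'].tolist())  # ['Surgical', 'Eyewear', 'First Aid', 'Other']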
Here is an option using a reversed dictionary.
Code
import pandas as pd

# Sample DataFrame
s = pd.Series(["gauze", "opthal", "tape", "surgical", "eye", "spectacles",
               "glasses", "optometers", "bandages", "cotton", "glue"])
df = pd.DataFrame({"SIC_Desc": s})
df

LOOKUP = {
    "Eyewear": ["eye", "glasses", "opthal", "spectacles", "optometers"],
    "First Aid": ["cotton", "bandages", "gauze", "tape"],
    "Surgical": ["surgical"],
    "Dental": ["dental", "denture"],
    "Mobility": ["wheelchairs", "walkers", "braces", "crutches", "ortho"],
}
REVERSE_LOOKUP = {v: k for k, lst in LOOKUP.items() for v in lst}

def map_company(row):
    company = row["SIC_Desc"].lower()
    return REVERSE_LOOKUP.get(company, "Other")

df["SIC_Desc"] = df.apply(map_company, axis=1)
df
Details
We define a LOOKUP dictionary with (key, value) pairs of expected output and associated words, respectively. Note the values are lowercase to simplify searching. Then we build a reversed dictionary that automatically inverts the key-value pairs and improves search performance, e.g.:
>>> REVERSE_LOOKUP
{'bandages': 'First Aid',
 'cotton': 'First Aid',
 'eye': 'Eyewear',
 'gauze': 'First Aid',
 ...}
Notice these reference dictionaries are created outside the mapping function to avoid rebuilding dictionaries for every call to map_company(). Finally the mapping function quickly returns the desired output using the reversed dictionary by calling .get(), a method that returns the default argument "Other" if no entry is found.
See @Flynsee's insightful answer for an explanation of what is happening in your code. The code is cleaner compared to a bevy of conditional statements.
Benefits
Since we have used dictionaries, the search time should be relatively fast, O(1), compared to the O(n) complexity of membership tests with in. Moreover, the main LOOKUP dictionary is adaptable and liberates you from manually implementing extensive conditional statements for new entries.

Converting an imperative algorithm that "grows" a table into pure functions

My program, written in Python 3, has many places where it starts with a (very large) table-like numeric data structure and adds columns to it following a certain algorithm. (The algorithm is different in every place.)
I am trying to convert this into a pure functional approach since I run into problems with the imperative one (hard to reuse, hard to memoize interim steps, hard to achieve "lazy" computation, bug-prone due to reliance on state, etc.).
The Table class is implemented as a dictionary of dictionaries: the outer dictionary contains rows, indexed by row_id; the inner contains values within a row, indexed by column_title. The table's methods are very simple:
# return the value at the specified row_id, column_title
get_value(self, row_id, column_title)
# return the inner dictionary representing row given by row_id
get_row(self, row_id)
# add a column new_column_title, defined by func
# func signature must be: take a row and return a value
add_column(self, new_column_title, func)
Until now, I simply added columns to the original table, and each function took the whole table as an argument. As I'm moving to pure functions, I'll have to make all arguments immutable. So, the initial table becomes immutable. Any additional columns will be created as standalone columns and passed only to those functions that need them. A typical function would take the initial table, and a few columns that are already created, and return a new column.
The problem I run into is how to implement the standalone column (Column).
I could make each of them a dictionary, but it seems very expensive. Indeed, if I ever need to perform an operation on, say, 10 fields in each logical row, I'll need to do 10 dictionary lookups. And on top of that, each column will contain both the key and the value, doubling its size.
I could make Column a simple list, and store in it a reference to the mapping from row_id to the array index. The benefit is that this mapping could be shared between all columns that correspond to the same initial table, and once a row_id is looked up, the resulting index works for all columns. But does this create any other problems?
If I do this, can I go further, and actually store the mapping inside the initial table itself? And can I place references from the Column objects back to the initial table from which they were created? It seems very different from how I imagined a functional approach to work, but I cannot see what problems it would cause, since everything is immutable.
In general, does the functional approach frown on keeping a reference in the return value to one of the arguments? It doesn't seem like it would break anything (like optimization or lazy evaluation), since the argument was already known anyway. But maybe I'm missing something.
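To make the standalone-Column idea described above concrete, here is a minimal sketch of a list-backed column with a shared row_id-to-index mapping (class and variable names are illustrative only, not from the original post):

class Column:
    """Immutable column storing values in a tuple; the row_id -> position
    mapping is shared with every other column of the same table."""
    def __init__(self, values, index_map):
        self._values = tuple(values)
        self._index_map = index_map  # shared dict, built once per table

    def get_value(self, row_id):
        return self._values[self._index_map[row_id]]

# one mapping shared by all columns derived from the same table
index_map = {'r1': 0, 'r2': 1, 'r3': 2}
col_a = Column([10, 20, 30], index_map)
col_b = Column(['x', 'y', 'z'], index_map)
assert col_a.get_value('r2') == 20 and col_b.get_value('r2') == 'y'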
Here is how I would do it:
Derive your table class from a frozenset. Each row should be a subclass of tuple.
Now you can't modify the table -> immutability, great! The next step could be to consider each function a mutation which you apply to the table to produce a new one:
f T -> T'
That should be read as: apply the function f on the table T to produce a new table T'. You may also try to objectify the actual processing of the table data and see it as an Action which you apply or add to the table.
add(T, A) -> T'
The great thing here is that add could instead be subtract, giving you an easy way to model undo. When you get into this mindset, your code becomes very easy to reason about because you have no state that can screw things up.
Below is an example of how one could implement and process a table structure in a purely functional way in Python. Imho, Python is not the best language to learn FP in because it makes it too easy to program imperatively. Haskell, F# or Erlang are better choices, I think.
from functools import reduce

class Table(frozenset):
    # The rows live in the frozenset itself and must be set in __new__,
    # since frozenset is immutable; __init__ only records the column names.
    def __new__(cls, names, rows):
        return frozenset.__new__(cls, rows)

    def __init__(self, names, rows):
        self.names = names

def add_column(rows, func):
    return [row + (func(row, idx),) for (idx, row) in enumerate(rows)]

def table_process(t, action):
    name, func = action
    return Table(
        t.names + (name,),
        add_column(t, lambda row, idx: func(row))
    )

def table_filter(t, action):
    name, func = action
    names = t.names
    idx = names.index(name)
    return Table(
        names,
        [row for row in t if func(row[idx])]
    )

def table_rank(t, name):
    names = t.names
    idx = names.index(name)
    rows = sorted(t, key=lambda row: row[idx])
    return Table(
        names + ('rank',),
        add_column(rows, lambda row, idx: idx)
    )

def table_print(t):
    format_row = lambda r: ' '.join('%15s' % c for c in r)
    print(format_row(t.names))
    print('\n'.join(format_row(row) for row in t))

if __name__ == '__main__':
    from random import randint
    cols = ('c1', 'c2', 'c3')
    T = Table(
        cols,
        [tuple(randint(0, 9) for x in cols) for x in range(10)]
    )
    table_print(T)

    # Columns to add to the table. This is a perfect fit for a
    # reduce. I'd honestly use a boring for loop instead, but reduce
    # is a perfect example of how in FP data and code "become one."
    # In fact, this whole program could have been written as just one
    # big reduce.
    actions = [
        ('max', max),
        ('min', min),
        ('sum', sum),
        ('avg', lambda r: sum(r) / len(r))
    ]
    T = reduce(table_process, actions, T)
    table_print(T)

    # Ranking is different because it requires an ordering, which a
    # table does not have.
    T2 = table_rank(T, 'sum')
    table_print(T2)

    # Simple where filter: select * from T2 where c2 < 5.
    T3 = table_filter(T2, ('c2', lambda c: c < 5))
    table_print(T3)
