I have the following code:
import pandas as pd
from pandas_datareader import data as web
import numpy as np
import math
data = web.DataReader('goog', 'yahoo')
data['lifetime'] = data['High'].asfreq('D').rolling(window=999999, min_periods=1).max()  # to check if it is a lifetime high
How can I compare the two so that, for each row, I get a boolean (preferably as 1 and 0) indicating whether data['High'] is close to data['lifetime'], something like:
data['isclose'] = math.isclose(data['High'], data['lifetime'], rel_tol = 0.003)
Any help would be appreciated.
You can use np.where(), combined with NumPy's vectorized np.isclose() (math.isclose() only accepts scalars, so it fails on whole Series):
import numpy as np
data['isclose'] = np.where(np.isclose(data['High'], data['lifetime'], rtol=0.003), 1, 0)
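Since np.isclose() already returns a boolean array, an equivalent one-liner (just a stylistic alternative) is to cast it directly:
data['isclose'] = np.isclose(data['High'], data['lifetime'], rtol=0.003).astype(int)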
You could also use pandas' apply() function:
import math
from pandas_datareader import data as web
data = web.DataReader("goog", "yahoo")
data["lifetime"] = data["High"].asfreq("D").rolling(window=999999, min_periods=1).max()
data["isclose"] = data.apply(
lambda row: 1 if math.isclose(row["High"], row["lifetime"], rel_tol=0.003) else 0,
axis=1,
)
print(data)
However, yudhiesh's solution using np.where() is faster.
See also: Why is np.where faster than pd.apply
How to assign new values to each row?
import pandas as pd
import numpy as np
for i in np.arange(len(dataset)):
    if dataset['comment_num'].iloc[i] == 0:
        dataset['words'].iloc[i] = 0
        dataset['characters'].iloc[i] = 0
works.
But it seems that neither
dataset[['words', 'characters']].iloc[i] = [0, 0]
nor
dataset[['words', 'characters']].iloc[i] = dataset[['words', 'characters']].iloc[i].replace([''],'0')
works.
Any suggestions will be greatly appreciated!
You can use the .loc indexer, which assigns to both columns at once:
dataset.loc[i, ['words', 'characters']] = 0
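As a side note (my addition, not part of the original answer), the loop can be dropped entirely, since .loc also accepts a boolean mask; a minimal sketch:
mask = dataset['comment_num'] == 0
dataset.loc[mask, ['words', 'characters']] = 0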
I am trying to run a specific function (stats.boxcox) faster in Python, but both versions of my code run very slowly. Is there a more efficient way to do this?
First code:
import numpy as np
import pandas as pd
from scipy import stats
df = pd.DataFrame(np.random.randint(1,100,size=(100, 4)), columns=list('ABCD'))
df_new = pd.DataFrame()
for column in list(df):
    df_new[column], lam = stats.boxcox(df[column])
Second code:
import numpy as np
import pandas as pd
from scipy import stats
df = pd.DataFrame(np.random.randint(1,100,size=(100, 4)), columns=list('ABCD'))
df2_a = df.apply(lambda x: stats.boxcox(x))
df2_al = list(zip(*df2_a))
df2 = pd.DataFrame(df2_al[0]).T
We can do it in parallel using ProcessPoolExecutor like this:
import numpy as np
import pandas as pd
from scipy import stats
from concurrent.futures import ProcessPoolExecutor
df = pd.DataFrame(np.random.randint(1, 100, size=(100, 4)), columns=list('ABCD'))
# utility function: run func over an iterable of inputs in worker processes
def cpu_tasks(func, *args):
    # chunksize controls how many tasks are sent to each worker at a time
    with ProcessPoolExecutor(max_workers=None) as tp:
        result = tp.map(func, *args, chunksize=10)
    return list(result)
def get_box(s):
    # keep only the transformed data, drop the fitted lambda
    return stats.boxcox(s)[0]
# make a list of columns, one task per column
obj_lst = [df[x].tolist() for x in df.columns]
ddf = pd.DataFrame(cpu_tasks(get_box, obj_lst)).T
ddf.columns = list('ABCD')
print(ddf.head())
A B C D
0 8.906146 23.867250 23.682790 16.669473
1 16.948358 11.752727 18.120534 15.678950
2 23.892165 7.159087 1.693517 17.055528
3 11.210063 5.720299 29.496806 13.348581
4 6.588403 29.940734 35.874053 4.101704
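One caveat worth adding (my note, not from the original answer): on platforms that spawn worker processes instead of forking them (Windows, and macOS by default), the executor has to be started from under a main guard, roughly:
if __name__ == '__main__':
    obj_lst = [df[x].tolist() for x in df.columns]
    ddf = pd.DataFrame(cpu_tasks(get_box, obj_lst)).T
    ddf.columns = list('ABCD')
    print(ddf.head())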
I am running a Levenshtein comparison on 50k records. I need to compare each record with every other record. Is there a way to optimize the following code to run faster? The data is stored in a pandas DataFrame.
import pandas as pd
import numpy as np
import Levenshtein
df_s_sorted = df.sort_values(['nonascii_2', 'birth_date'])
df_similarity = pd.DataFrame()
q=0
for index, p in df_s_sorted.iterrows():
    q = q + 1
    print(q)
    # use a distinct name for the inner index so drop() below removes the outer row
    for index1, p1 in df_s_sorted.iterrows():
        if (p["birth_date"] == p1["birth_date"]) and (p["name"] != p1["name"]):
            if Levenshtein.distance(p["name"], p1["name"]) == 1:
                df_similarity = df_similarity.append(p)
                print(p)
    df_s_sorted.drop(index, inplace=True)
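One way to speed this up substantially (a sketch of mine, not from the original thread, assuming the goal is unchanged: names at Levenshtein distance 1 that share a birth date) is to group by birth_date first, since the inner condition already requires equal birth dates, and compare names only within each group instead of scanning all 50k x 50k pairs:
import pandas as pd
import Levenshtein
similar_rows = []
for _, group in df_s_sorted.groupby('birth_date'):
    names = group['name'].tolist()
    # compare each pair within the group exactly once
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            if names[i] != names[j] and Levenshtein.distance(names[i], names[j]) == 1:
                similar_rows.append(group.iloc[i])
df_similarity = pd.DataFrame(similar_rows)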
I've been trying to create a table with randomly generated data using Pandas and NumPy. I've looked at the Pandas cheat sheet but still can't get this to work.
import names
import pandas as pd
import random
import numpy as np
random.seed(100)
currency_numbers = random.sample(range(100000, 1000000), 100)
s = pd.Series(np.random.randn(100))
raw_data = {
    "Names": ["".join(names.get_full_name()) for i in range(100)],
    "Names2": ["".join(names.get_full_name()) for i in range(100)],
    "Currency": []
}
df = pd.DataFrame(raw_data, columns=["Names", "Names2", "Currency"])
df.head()
How can I create a column of 100 random numbers for the Currency section?
Just use the function np.random.randint().
For example, calling np.random.randint(1000, size=100) draws values from [0, 1000) (so the largest possible integer is 999) and returns an array of length 100.
Therefore, in your case:
s = np.random.randint(1000, size=100)
then set Currency to s:
"Currency": s
and the resulting DataFrame will have a column with 100 random numbers.
Just FYI, with this function you can also set a low and a high range...
So in your case it would be something like this:
s = np.random.randint(100000, 1000000, size=100)
Please check whether this helps.
import names
import pandas as pd
import random
import numpy as np
random.seed(100)
currency_numbers = np.random.randint(100000,1000000,size=(1,100))
s = pd.Series(np.random.randn(100))
raw_data = {
    "Names": ["".join(names.get_full_name()) for i in range(100)],
    "Names2": ["".join(names.get_full_name()) for i in range(100)],
    "Currency": currency_numbers[0]
}
df = pd.DataFrame(raw_data, columns=["Names", "Names2", "Currency"])
df.head()
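One detail worth flagging (my note, not the answerer's): random.seed(100) seeds only the stdlib random module, so the NumPy calls above (np.random.randint and np.random.randn) are not reproducible across runs. Seeding NumPy as well fixes that:
np.random.seed(100)  # random.seed() does not affect NumPy's generator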
I have very sparse data in a pandas DataFrame with 25 million+ records. It has to be converted into a multi-dimensional NumPy array. I have written this the straightforward way using a for loop, and was wondering if there is a more efficient way.
import numpy as np
import pandas as pd
facts_pd = pd.DataFrame.from_records(
    columns=['name', 'offset', 'code'],
    data=[('John', -928, 'dx_434'), ('Steve', -757, 'dx_5859'), ('Jack', -800, 'dx_250'),
          ('John', -919, 'dx_401'), ('John', -956, 'dx_5859')])
name_lu = pd.DataFrame(sorted(facts_pd['name'].unique()), columns=['name'])
name_lu["nameid"] = name_lu.index
offset_lu = pd.DataFrame(sorted(facts_pd['offset'].unique(), reverse=True), columns=['offset'])
offset_lu["offsetid"] = offset_lu.index
code_lu = pd.DataFrame(sorted(facts_pd['code'].unique()), columns=['code'])
code_lu["codeid"] = code_lu.index
facts_pd = pd.merge(pd.merge(pd.merge(facts_pd, name_lu, how="left", on="name"),
                    offset_lu, how="left", on="offset"), code_lu, how="left", on="code")
facts_pd.drop(["name","offset","code"], inplace=True, axis=1)
facts_np = np.zeros((len(name_lu),len(offset_lu),len(code_lu)))
for row in facts_pd.iterrows():
    i, j, k = row[1]
    facts_np[i][j][k] = 1
The command you are probably looking for is dataframe.as_matrix(). Despite the name, it returns a NumPy array, not a matrix; see the man page for it. (Note that as_matrix() was deprecated in pandas 0.23 and removed in 1.0, so on current versions use dataframe.to_numpy() instead.)
Here is another Stack Overflow topic on its use as well.
Refurbished code:
import numpy as np
import pandas as pd
facts_pd = pd.DataFrame.from_records(
    columns=['name', 'offset', 'code'],
    data=[('John', -928, 'dx_434'), ('Steve', -757, 'dx_5859'), ('Jack', -800, 'dx_250'),
          ('John', -919, 'dx_401'), ('John', -956, 'dx_5859')])
facts_np = facts_pd.to_numpy()  # as_matrix() on pandas versions older than 1.0
print(facts_np)  # displays the frame's contents as a NumPy array
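As a final note (my addition, not part of the original answer): to_numpy() only gives the flat 2-D view of the frame. To build the 3-D indicator array from the question without the Python loop, NumPy fancy indexing can set all the ones in one vectorized statement, assuming the nameid/offsetid/codeid columns produced by the merges in the question:
facts_np = np.zeros((len(name_lu), len(offset_lu), len(code_lu)))
facts_np[facts_pd['nameid'].to_numpy(),
         facts_pd['offsetid'].to_numpy(),
         facts_pd['codeid'].to_numpy()] = 1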