I'm trying to find a more efficient way to do this:
creating a pandas data frame where each column is filled by calling a Faker function that returns a string, once per row.
import pandas as pd
from faker import Faker

fake = Faker()

def createDF(size):
    df = pd.DataFrame()
    df["Name"] = [fake.name() for _ in range(size)]
    df["Email"] = [fake.free_email() for _ in range(size)]
    df["Address"] = [fake.address() for _ in range(size)]
    df["Phone"] = [fake.phone_number() for _ in range(size)]
    df["Comment"] = [fake.text() for _ in range(size)]
    return df
These are failed attempts, just to showcase what I've already tried.
def create_0(size):
    df = pd.DataFrame(
        {"Name": [fake.name() for _ in range(size)],
         "Email": [fake.free_email() for _ in range(size)],
         "Address": [fake.address() for _ in range(size)],
         "Phone": [fake.phone_number() for _ in range(size)],
         "Comment": [fake.text() for _ in range(size)]
        }
    )
    return df
def create_1(size):
    # note: each faker is called only once here, so the same value is repeated size times
    df = pd.DataFrame()
    df["Name"] = [fake.name()] * size
    df["Email"] = [fake.free_email()] * size
    df["Address"] = [fake.address()] * size
    df["Phone"] = [fake.phone_number()] * size
    df["Comment"] = [fake.text()] * size
    return df
def create_2(size):
    names = []
    emails = []
    addresses = []
    phones = []
    comments = []
    for _ in range(size):
        names.append(fake.name())
        emails.append(fake.free_email())  # was fake.name(), presumably a copy-paste slip
        addresses.append(fake.address())
        phones.append(fake.phone_number())
        comments.append(fake.text())
    df = pd.DataFrame({"Name": names, "Email": emails, "Address": addresses, "Phone": phones, "Comment": comments})
    return df
import itertools

def create_3(size):
    df = pd.DataFrame()
    df["Name"] = list(itertools.repeat(df.apply(fake.name, axis=1), size))
    df["Email"] = list(itertools.repeat(df.apply(fake.free_email, axis=1), size))
    df["Address"] = list(itertools.repeat(df.apply(fake.address, axis=1), size))
    df["Phone"] = list(itertools.repeat(df.apply(fake.phone_number, axis=1), size))
    df["Comment"] = list(itertools.repeat(df.apply(fake.text, axis=1), size))
    return df
def create_4(size):
    df = pd.DataFrame()
    df["Name"] = itertools.repeat((df.apply(lambda: fake.name(), axis=1)), size)
    df["Email"] = itertools.repeat((df.apply(lambda: fake.name(), axis=1)), size)
    df["Address"] = itertools.repeat((df.apply(lambda: fake.name(), axis=1)), size)
    df["Phone"] = itertools.repeat((df.apply(lambda: fake.name(), axis=1)), size)
    df["Comment"] = itertools.repeat((df.apply(lambda: fake.name(), axis=1)), size)
    return df
I've also read about different approaches online using pandas .map and .Series, but I'm not sure how to implement them.
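For reference, here is my guess at what a .map-based variant of those posts would look like (a sketch only; it maps over a dummy Series, and it is unlikely to beat a plain list comprehension):

dummy = pd.Series(range(size))
df["Name"] = dummy.map(lambda _: fake.name())
df["Email"] = dummy.map(lambda _: fake.free_email())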
There's no need to loop multiple times:
def createDF(size):
    data = {'Name': [],
            'Email': [],
            'Address': [],
            'Phone': [],
            'Comment': []}
    for _ in range(size):
        data['Name'].append(fake.name())
        data['Email'].append(fake.free_email())
        data['Address'].append(fake.address())
        data['Phone'].append(fake.phone_number())
        data['Comment'].append(fake.text())
    return pd.DataFrame(data)
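An equivalent single-loop variant builds one dict per row and lets the DataFrame constructor assemble the columns (a sketch; the same idea, just shaped row-wise):

def createDF(size):
    rows = [{'Name': fake.name(),
             'Email': fake.free_email(),
             'Address': fake.address(),
             'Phone': fake.phone_number(),
             'Comment': fake.text()} for _ in range(size)]
    return pd.DataFrame(rows)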
How can I rank the rows of a data frame based on row values? I.e., I have a column that contains text data and want to assign a rank based on similarity between rows.
I have tried with the Levenshtein distance, but I'm not sure how to do it for the whole table.
import string
import editdistance as ed  # assumption: "ed" refers to the editdistance package (ed.eval)

def bow(x=None):
    x = x.lower()
    words = x.split(' ')
    words.sort()
    x = ' '.join(words)
    exclude = set('{}{}'.format(string.punctuation, string.digits))
    x = ''.join(ch for ch in x if ch not in exclude)
    x = '{} '.format(x.strip())
    return x

#intents = load_intents(export=True)
df['bow'] = df['name'].apply(lambda x: bow(x))
df.sort_values(by='bow', ascending=True, inplace=True)
last_bow = ''
recs = []
for idx, row in df.iterrows():
    record = {
        'name': row['name'],
        'bow': row['bow'],
        'lev_distance': ed.eval(last_bow, row['bow'])
    }
    recs.append(record)
    last_bow = row['bow']
intents = pd.DataFrame(recs, columns=['name', 'bow', 'lev_distance'])
l = intents[intents['lev_distance'] <= lev_distance_range]  # lev_distance_range: a chosen distance threshold
r = []
for x in l.index.values:
    r.append(x - 1)
    r.append(x)
r = list(set(r))
l = intents.iloc[r, :]
Using textdistance, you could try this:
import pandas as pd
import textdistance

df = pd.DataFrame(
    {
        "text": [
            "Rahul dsa",
            "Rasul dsad",
            "Raul ascs",
            "shrez",
            "Indya",
            "Indi",
            "shez",
            "india",
            "kloa",
            "klsnsd",
        ],
    }
)

df = (
    df
    .assign(
        match=df["text"].map(
            lambda x: [
                i
                for i, text in enumerate(df["text"])
                if textdistance.jaro_winkler(x, text) >= 0.9
            ]
        )
    )
    .sort_values(by="match")
    .drop(columns="match")
)

print(df)
# Output
         text
0   Rahul dsa
1  Rasul dsad
2   Raul ascs
3       shrez
6        shez
4       Indya
5        Indi
7       india
8        kloa
9      klsnsd
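If an explicit rank label is needed rather than just the grouped ordering, one possible extension (an assumption about the desired output: a shared label per cluster of similar strings) keeps the smallest matching index as a group id:

df["rank"] = df["text"].map(
    lambda x: min(
        i for i, text in enumerate(df["text"])
        if textdistance.jaro_winkler(x, text) >= 0.9
    )
)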
I have a big dataframe with two million rows. There are 60000 unique (store_id, product_id) pairs.
I need to select each (store_id, product_id) group, do some calculations, such as resample to H, sum, avg, and finally concat everything into a new dataframe.
The problem is that it is very, very slow, and becomes slower while running.
The main code is:
from tqdm import tqdm

def process_df(df, func, *args, **kwargs):
    '''
    '''
    product_ids = df.product_id.unique()
    store_ids = df.store_id.unique()
    # uk = df.drop_duplicates(subset=['store_id','product_id'])
    # for idx, item in uk.iterrows():
    all_df = list()
    i = 1
    with tqdm(total=product_ids.shape[0]*store_ids.shape[0]) as t:
        for store_id in store_ids:
            sdf = df.loc[df['store_id'] == store_id]
            for product_id in product_ids:
                new_df = sdf.loc[sdf['product_id'] == product_id]
                if new_df.shape[0] < 14:
                    continue
                new_df = func(new_df, *args, **kwargs)
                new_df.loc[:, 'store_id'] = store_id
                new_df.loc[:, 'product_id'] = product_id
                all_df.append(new_df)
                t.update()
    all_df = pd.concat(all_df)
    return all_df
def process_order_items(df, store_id=None, product_id=None, freq='D'):
    if store_id and "store_id" in df.columns:
        df = df.loc[df['store_id'] == store_id]
    if product_id and "product_id" in df.columns:
        df = df.loc[df['product_id'] == product_id]
    # convert to datetime
    df.loc[:, "datetime_create"] = pd.to_datetime(df.time_create, unit='ms').dt.tz_localize('UTC').dt.tz_convert('Asia/Shanghai').dt.tz_localize(None)
    df = df[["price", "count", "fee_total", "fee_real", "price_real", "price_guide", "price_change_category", "datetime_create"]]
    df.loc[:, "has_discount"] = (df.price_change_category > 0).astype(int)
    df.loc[:, "clearance"] = df.price_change_category.apply(lambda x: x in (10, 20, 23)).astype(int)
    if not freq:
        df.loc[:, "date_create"] = df["datetime_create"]
    else:
        assert freq in ('D', 'H')
        df.index = df.loc[:, "datetime_create"]
        discount_order_count = df['has_discount'].resample(freq).sum()
        clearance_order_count = df['clearance'].resample(freq).sum()
        discount_sale_count = df.loc[df.has_discount > 0, 'count'].resample(freq).sum()
        clearance_sale_count = df.loc[df.clearance > 0, 'count'].resample(freq).sum()
        no_discount_price = df.loc[df.has_discount == 0, 'price'].resample(freq).sum()
        no_clearance_price = df.loc[df.clearance == 0, 'price'].resample(freq).sum()
        order_count = df['count'].resample(freq).count()
        day_count = df['count'].resample(freq).sum()
        price_guide = df['price_guide'].resample(freq).max()
        price_avg = (df['price'] * df['count']).resample(freq).sum() / day_count
        df = pd.DataFrame({
            "price": price_avg,
            "price_guide": price_guide,
            "sale_count": day_count,
            "order_count": order_count,
            "discount_order_count": discount_order_count,
            "clearance_order_count": clearance_order_count,
            "discount_sale_count": discount_sale_count,
            "clearance_sale_count": clearance_sale_count,
        })
        df = df.drop(df[df.order_count == 0].index)
    return df
I think the problem is that there are too many redundant selections.
Maybe I could use groupby(['store_id','product_id']).agg to avoid the redundancy, but I have no idea how to use process_order_items with it and merge the results together.
I think you can change:
df.loc[:,"clearance"] = df.price_change_category.apply(lambda x:x in(10, 20, 23)).astype(int)
to Series.isin:
df["clearance"] = df.price_change_category.isin([10, 20, 23]).astype(int)
There is also a solution using Resampler.aggregate:
d = {'has_discount': 'sum',
     'clearance': 'sum',
     'count': ['count', 'sum'],
     'price_guide': 'max'}
df1 = df.resample(freq).agg(d)
df1.columns = df1.columns.map('_'.join)
# the flattened names follow the <column>_<aggregation> pattern produced above
d1 = {'has_discount_sum': 'discount_order_count',
      'clearance_sum': 'clearance_order_count',
      'count_count': 'order_count',
      'count_sum': 'day_count',
      'price_guide_max': 'price_guide'}
df1 = df1.rename(columns=d1)
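To combine this with the groupby idea from the question, the groupby and resample steps can be chained (a sketch; it assumes datetime_create is moved into the index first):

df1 = (df.set_index('datetime_create')
         .groupby(['store_id', 'product_id'])
         .resample(freq)
         .agg(d))
df1.columns = df1.columns.map('_'.join)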
Another idea is not to convert the boolean masks to integers, but to use the boolean columns directly for filtering, like:
df["has_discount"] = df.price_change_category > 0
df["clearance"] = df.price_change_category.isin([10, 20, 23])
discount_sale_count = df.loc[df.has_discount, 'count'].resample(freq).sum()
clearance_sale_count = df.loc[df.clearance, 'count'].resample(freq).sum()
# for filtering == 0, invert the boolean mask columns with ~
no_discount_price = df.loc[~df.has_discount, 'price'].resample(freq).sum()
no_clearance_price = df.loc[~df.clearance, 'price'].resample(freq).sum()
The first function should be simplified with GroupBy.apply instead of loops; then concat is not necessary:
def f(x):
    print(x)

df = df.groupby(['product_id', 'store_id']).apply(f)
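A possible way to wire the existing per-group logic in (a sketch; it assumes process_order_items can accept each group frame as-is, and it keeps the minimum-size filter from process_df):

def f(g):
    if len(g) < 14:  # same threshold as in process_df
        return None
    return process_order_items(g, freq='H')

result = df.groupby(['store_id', 'product_id']).apply(f)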
Cost function implemented with Python:
Thanks for the help in achieving this.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

load_data = pd.read_csv(r'C:\python_program\ex1data1.txt', sep=",", header=None)  # raw string for the Windows path
feature_vale = load_data[0]
y = np.matrix(load_data[1])
m = len(feature_vale)
plt.scatter(load_data[0], load_data[1], marker='+', c='r')
plt.title("Cost_Function")
plt.xlabel("Population of City in 10,000s")
plt.ylabel("Profit in $10,000s")
df = pd.DataFrame(pd.Series(1, index=range(0, m)))  # column of ones for the intercept term
df[1] = load_data[0]
X = np.matrix(df)
row_theta = np.zeros(2, dtype=int)
theta = np.array([row_theta])  # transposed below via theta.T
prediction = np.dot(X, theta.T)
error = (prediction - y.T)
error_df = pd.DataFrame(error)
# square the error
squared_error = np.square(error_df)
total = np.sum(squared_error)  # renamed from "sum" to avoid shadowing the built-in
print(total)
J = np.sum(squared_error) / (2 * m)
print(J)
Data reference link: searchcode.com/codesearch/view/5404318
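For reference, the code above computes the standard linear-regression cost J(theta) = (1/(2m)) * sum((X·theta - y)^2). A minimal vectorized sketch of the same computation (my addition; it assumes X already carries the column of ones and y is a length-m column vector):

def compute_cost(X, y, theta):
    # J(theta) = (1/(2m)) * sum((X @ theta - y)**2)
    m = len(y)
    residual = np.asarray(X) @ np.asarray(theta) - np.asarray(y)
    return float(residual.T @ residual) / (2 * m)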
Repeat the following steps and let me know:
load_data = pd.read_csv('data.txt', sep=",", header=None)
feature_vale = load_data[0]
y = np.matrix(load_data[1])
m = len(feature_vale)
#print(m)
#plt.scatter(load_data[0], load_data[1])
df = pd.DataFrame(pd.Series(1, index=range(0, m)))
df[1] = load_data[0]
X = np.matrix(df)
row_theta = np.zeros(2, dtype=int)
theta = np.array([row_theta])  # transposed below via theta.T
print(theta.T)
prediction = np.matmul(X, theta.T)
error = (prediction - y)
error_df = pd.DataFrame(error)
squared_error = np.square(error_df)
print(squared_error)
I have this code:
import numpy as np
import pandas as pd
from sklearn import tree

train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1
train["Embarked"] = train["Embarked"].fillna("S")
train["Age"] = train["Age"].fillna(train["Age"].median())
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2
target = train["Survived"].values
features_one = train[["Pclass", "Sex", "Age", "Fare"]].values
my_tree_one = tree.DecisionTreeClassifier()
my_tree_one = my_tree_one.fit(features_one, target)
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url)
test.Fare[152] = test["Fare"].median()
test["Sex"][test["Sex"] == "male"] = 0
test["Sex"][test["Sex"] == "female"] = 1
test["Embarked"] = test["Embarked"].fillna("S")
test["Age"] = test["Age"].fillna(test["Age"].median())
test["Embarked"][test["Embarked"] == "S"] = 0
test["Embarked"][test["Embarked"] == "C"] = 1
test["Embarked"][test["Embarked"] == "Q"] = 2
test_features = test[["Pclass", "Sex", "Age", "Fare"]].values
my_prediction = my_tree_one.predict(test_features)
PassengerId = np.array(test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(my_prediction, PassengerId)
my_solution.to_csv("5.csv", index_label = ["PassangerId", "Survived"])
As you can see, I only want to save a CSV with two columns, but when I look at the file 5.csv another column called 0 has been added. Does anybody know why?
You're seeing this behaviour because you're passing two index labels when there is only one index.
You can instead name your one column as such:
my_solution.columns = ['Survived']
And then label your index like so:
my_solution.to_csv("5.csv", index_label=["PassengerId"])
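Putting both parts together (a sketch of the combined fix):

my_solution = pd.DataFrame(my_prediction, index=PassengerId, columns=["Survived"])
my_solution.to_csv("5.csv", index_label="PassengerId")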
Try this slightly optimized solution:
import pandas as pd
from sklearn import tree

train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
cols = ["Pclass", "Sex", "Age", "Fare"]
mappings = {
    'Sex': {'male': 0, 'female': 1},
}

def cleanup(df, mappings=mappings):
    # map non-numeric columns
    for c in mappings.keys():
        df[c] = df[c].map(mappings[c])
    # replace NaN's with average value
    for c in df.columns[df.isnull().any()]:
        df[c].fillna(df[c].mean(), inplace=True)
    return df

# parse train data set
train = cleanup(pd.read_csv(train_url, usecols=cols + ['Survived']))  # was d.read_csv, a typo
my_tree_one = tree.DecisionTreeClassifier()
my_tree_one.fit(train.drop('Survived', axis=1), train['Survived'])

# parse test data set
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url, usecols=cols + ['PassengerId'])
result = test.pop('PassengerId').to_frame('PassengerId')
test = cleanup(test)
result['Survived'] = my_tree_one.predict(test)
result.to_csv("5.csv", index=False)
NB: my code runs if copied as-is.
I wrote a simple script to backtest cryptocurrencies using the Poloniex API.
First I request the data from the API and turn it into a dataframe data.
Then I take the data I want and make a new df called df.
A function trade must then be run on each line in df; simply put, it buys if the price is above the rolling mean and sells if below. This data is then saved in log.
I am having trouble applying this function to each row in df.
I had great success using the line log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1), but surprisingly it works when BTC_ETH is used in the API call and not for others, i.e. BTC_FCT or BTC_DOGE, despite the data being identical in form. Using ETH results in the creation of a DataFrame (which is what I want); DOGE and FCT create a Series.
First question: how can I run my trade function on each row and create a new df log with the results?
Bonus question: even though the data types are the same, why does it work for ETH but not for DOGE/FCT?
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

API = 'https://poloniex.com/public?command=returnChartData&currencyPair=BTC_FCT&start=1435699200&end=9999999999&period=86400'
data = pd.read_json(API)

df = pd.DataFrame(columns={'date', 'close', 'MA'})
df.MA = pd.rolling_mean(data.close, 30)  # older pandas; newer versions use data.close.rolling(30).mean()
df.close = data.close
df.date = data.date
df = df.truncate(before=29)

def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')

log = pd.DataFrame(columns=['Date', 'type', 'profit', 'port_value'])
port = {'coin': 0, 'BTC': 1}

def trade(date, close, MA):
    if MA < close and port['coin'] == 0:
        coins_bought = port['BTC']/MA
        port['BTC'] = 0
        port['coin'] = coins_bought
        d = {'Date': date, 'type': 'buy', 'coin_value': port['coin'], 'btc_value': port['BTC']}
        return pd.Series(d)
    elif MA > close and port['BTC'] == 0:
        coins_sold = port['coin']*MA
        port['coin'] = 0
        port['BTC'] = coins_sold
        d = {'Date': date, 'type': 'sell', 'coin_value': port['coin'], 'btc_value': port['BTC']}
        print()
        return pd.Series(d)

log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
log = log.dropna()
print_full(log)
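For context on the bonus question, here is a sketch I added to demonstrate the general apply behaviour (not tied to the Poloniex data): apply(..., axis=1) expands to a DataFrame only when every call returns a Series; if some calls return None, the result stays a Series of objects.

demo = pd.DataFrame({'a': [1, 2, 3]})
all_series = demo.apply(lambda r: pd.Series({'x': r['a']}), axis=1)  # -> DataFrame
some_none = demo.apply(lambda r: pd.Series({'x': r['a']}) if r['a'] > 1 else None, axis=1)  # -> Series
print(type(all_series), type(some_none))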
EDIT:
I solved the problem: I fixed it by appending the dicts to a list and then using the DataFrame.from_dict() method to create the log dataframe. My code, just to clarify:
data_list = []

def trade(date, close, MA):  #, port):
    #d = {'Data': close}
    #test_log = test_log.append(d, ignore_index=True)
    if MA < close and port['coin'] == 0:
        coins_bought = port['BTC']/MA
        port['BTC'] = 0
        port['coin'] = coins_bought
        d = {'Date': date, 'type': 'buy', 'coin_value': port['coin'], 'btc_value': port['BTC']}
        data_list.append(d)
        #return pd.Series(d)
    elif MA > close and port['BTC'] == 0:
        coins_sold = port['coin']*MA
        port['coin'] = 0
        port['BTC'] = coins_sold
        d = {'Date': date, 'type': 'sell', 'coin_value': port['coin'], 'btc_value': port['BTC']}
        data_list.append(d)
        #return pd.Series(d)

df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
#log = log.dropna()  # leftover from the apply-based version
for key, value in port.items():
    print(key, value)
log = pd.DataFrame.from_dict(data_list)  # was log.from_dict(data_list), which discards the result
The problem is that you are not always returning a value in trade, which is confusing Pandas. Try this:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

API = 'https://poloniex.com/public?command=returnChartData&currencyPair=BTC_FCT&start=1435699200&end=9999999999&period=86400'
data = pd.read_json(API)

df = pd.DataFrame(columns={'date', 'close', 'MA'})
df.MA = pd.rolling_mean(data.close, 30)
df.close = data.close
df.date = data.date
df = df.truncate(before=29)

def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')

log = pd.DataFrame(columns=['Date', 'type', 'profit', 'port_value'])
port = {'coin': 0, 'BTC': 1}

def trade(date, close, MA):
    # always return a Series, even when no trade happens
    d = {'Date': date, 'type': '', 'coin_value': np.nan, 'btc_value': np.nan}
    if MA < close and port['coin'] == 0:
        coins_bought = port['BTC']/MA
        port['BTC'] = 0
        port['coin'] = coins_bought
        d['type'] = 'buy'
        d['coin_value'] = port['coin']
        d['btc_value'] = port['BTC']
    elif MA > close and port['BTC'] == 0:
        coins_sold = port['coin']*MA
        port['coin'] = 0
        port['BTC'] = coins_sold
        d['type'] = 'sell'
        d['coin_value'] = port['coin']
        d['btc_value'] = port['BTC']
    return pd.Series(d)

log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
log = log.dropna()
print_full(log)
However, as I mentioned in the comment, passing a function with side-effects to apply is not a good idea according to the documentation, and in fact I think it may not produce the correct result in your case.
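One pragmatic alternative (a sketch I added, not part of the original answer) is to make the iteration explicit, so the mutation of port happens exactly once per row in a well-defined order and apply is not involved at all:

records = []
for row in df.itertuples():
    result = trade(row.date, row.close, row.MA)
    if result['type']:  # keep only rows where a buy or sell actually happened
        records.append(result)
log = pd.DataFrame(records)
print_full(log)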