Pandas: Difficulty Adding New Columns - python

I have a Pandas dataframe, numeric_df, with a bunch of columns. I have this function:
def textstat_stats(text):
    """Compute eight readability metrics for *text* as an unlabeled Series.

    The result keeps the default RangeIndex (0..7); values appear in the
    fixed scorer order below.
    """
    scorers = [
        textstat.flesch_reading_ease,
        textstat.flesch_kincaid_grade,
        textstat.gunning_fog,
        textstat.smog_index,
        textstat.automated_readability_index,
        textstat.coleman_liau_index,
        textstat.linsear_write_formula,
        textstat.dale_chall_readability_score,
    ]
    return pd.Series([score(text) for score in scorers])
which returns a Pandas Series. Now I'm trying this:
numeric_df[['difficulty', 'grade_difficulty','gfog','smog','ari','cli','lwf','dcrs']] = textstat_stats(text)
However, I get this Error:
KeyError: "['difficulty' 'grade_difficulty' 'gfog' 'smog' 'ari' 'cli' 'lwf' 'dcrs'] not in index"
What am I doing incorrectly?
Thanks!

It seems you need to add an index to the Series, which creates the column names:
def textstat_stats(text):
    """Compute eight readability metrics for *text* as a labeled Series.

    Building the Series from a dict attaches each value to its column
    name directly; insertion order matches the original explicit index.
    """
    return pd.Series({
        'difficulty': textstat.flesch_reading_ease(text),
        'grade_difficulty': textstat.flesch_kincaid_grade(text),
        'gfog': textstat.gunning_fog(text),
        'smog': textstat.smog_index(text),
        'ari': textstat.automated_readability_index(text),
        'cli': textstat.coleman_liau_index(text),
        'lwf': textstat.linsear_write_formula(text),
        'dcrs': textstat.dale_chall_readability_score(text),
    })

df = textstat_stats(text)
print(df)

Related

I can't figure out why my function is returning after the first iteration

I have the following function, but it is returning after only the first iteration of the for loop, and I am stuck on this
def window_input(window_length: int, data: pd.DataFrame, cols_to_shift=None) -> pd.DataFrame:
    """Create lagged copies of selected columns plus a shifted target.

    For every column in *cols_to_shift*, adds columns ``<col>x1`` ..
    ``<col>x<window_length-1>`` holding values shifted back by 1 ..
    window_length-1 rows, plus a ``next_count`` column holding ``count``
    shifted back by *window_length* rows.  Rows made NaN by the shifts
    are dropped after each column is processed.

    Parameters
    ----------
    window_length : int
        Window size; lags run from 1 to window_length - 1.
    data : pd.DataFrame
        Source frame.  (Bug fix: the original ignored this parameter and
        copied the global ``df_per_station`` instead.)
    cols_to_shift : list of str, optional
        Columns to lag.  Defaults to the module-level ``columns_to_shift``
        for backward compatibility with existing callers.

    Returns
    -------
    pd.DataFrame
        Concatenation of the per-column augmented frames.
    """
    if cols_to_shift is None:
        cols_to_shift = columns_to_shift  # original module-level default
    appended = []
    df = data.copy()  # bug fix: use the parameter, not a global
    for col in cols_to_shift:
        for i in range(1, window_length):
            df[col + f'x{i}'] = df[col].shift(-i)
        if window_length > 1:
            # Target column: `count` shifted by the full window length
            # (the original created this only once i reached window_length).
            df['next_count'] = df['count'].shift(-window_length)
        # Drop rows where there is a NaN; the trimmed frame carries into
        # the next column iteration, as in the original.
        df = df.dropna(axis=0)
        appended.append(df)
    # Bug fix: concatenate and return *after* the loop, not inside it,
    # so the function no longer returns after the first iteration.
    return pd.concat(appended)

How to clean up older data for a dictionary of dataframes?

This is a snippet of my code. I need to clean up one day of older data. how do we do that for a dictionary of dataframes?
# NOTE(review): question code quoted as posted — indentation was lost in
# transcription, and it contains the bugs the question asks about.
master_train_dict = {}
# Build one trimmed DataFrame per id.
for id in list_of_id:
temp_df = df.loc[df["id"] == id].copy(deep=False)
temp_df.drop('id', axis=1, inplace=True)
temp_df.reset_index(drop=True, inplace=True)
alert_list = list(temp_df["title"])
train_embedding = get_embeddings(alert_list, model)
temp_df["train_embedding"] = train_embedding
# BUG(review): `parent_id` is never defined — the loop variable is `id`.
master_train_dict[parent_id] =
temp_df[["title","train_embedding","#timestamp"]]
#master_train_dict[parent_id] = temp_df
global master_dict
master_dict = master_train_dict
print(master_dict)
#clean up function
if len(master_dict)>0:
d = datetime.today() - timedelta(hours=1, minutes= 0)
# BUG(review): this replaces the whole dict with a boolean Series for a
# single id; the age filter must be applied per-DataFrame inside the dict.
master_dict=master_dict[id]['#timestamp']>d.strftime("%Y-%m-%d %H:%M:%S")
print(master_dict)
Consider working in defined methods and use groupby for building list or dict of subsetted data frames. Then call functions via dictionary comprehensions.
def build_df(sub):
    """Trim one id's group: drop the id column, attach embeddings, and
    keep only the title / embedding / timestamp columns.

    Parameters
    ----------
    sub : pd.DataFrame
        One group produced by ``df.groupby(["id"])``.

    Returns
    -------
    pd.DataFrame
        Frame reindexed to ["title", "train_embedding", "#timestamp"].
    """
    # Bug fix: the original body referenced `sub_df`, which was never
    # defined (the parameter is `sub`) and would raise NameError.
    # Copy so the caller's frame is not mutated by the inplace ops.
    sub_df = sub.copy()
    sub_df.drop('id', axis=1, inplace=True)
    sub_df.reset_index(drop=True, inplace=True)
    alert_list = list(sub_df["title"])
    # NOTE(review): get_embeddings / model are defined elsewhere in the project.
    train_embedding = get_embeddings(alert_list, model)
    sub_df["train_embedding"] = train_embedding
    sub_df = sub_df.reindex(["title", "train_embedding", "#timestamp"], axis="columns")
    return sub_df
# Build {id: trimmed frame}.  NOTE(review): groupby(["id"]) with a list may
# yield tuple keys on newer pandas; groupby("id") gives scalar keys — confirm.
master_train_dict = {i:build_df(g) for i, g in df.groupby(["id"])}
def clean_df(df):
    """Return only the rows whose '#timestamp' is newer than one hour ago.

    Timestamps are "%Y-%m-%d %H:%M:%S" strings, so lexicographic
    comparison orders them chronologically.
    """
    cutoff = (datetime.today() - timedelta(hours=1, minutes=0)).strftime("%Y-%m-%d %H:%M:%S")
    return df.loc[df['#timestamp'] > cutoff]
# Apply the age filter to every stored frame, keeping the dict shape.
clean_master_train_dict = {k:clean_df(v) for k, v in master_train_dict.items()}

How do I update a value in a dataframe in a loop?

I am trying to update a rating row by row. I have one dataframe of players, that all start with the same rating. For each match, I want the rating to change. Another dataframe contains results of each match.
import pandas as pd
# Match log: each row is (winner, loser).  The 'Looser' spelling is kept
# because later code indexes row['Looser'].
gamesdata = [['paul','tom'],['paul','lisa'],['tom','paul'],['lisa','tom'],['paul','lisa'],['lisa','tom'],['paul','tom']]
games = pd.DataFrame(gamesdata, columns = ['Winner', 'Looser'])
playersdata= ['lisa','paul','tom']
players = pd.DataFrame(playersdata, columns = ['Name'])
# Elo parameters: common starting rating, rating scale, and K-factor.
mean_elo = 1000
elo_width = 400
k_factor = 64
# Every player starts at the same rating.
players['elo'] = mean_elo
def update_elo(winner_elo, loser_elo):
    """Return the post-match (winner, loser) ratings.

    The winner gains ``k_factor * (1 - expected win probability)`` points
    and the loser loses the same amount — a zero-sum update.
    """
    delta = k_factor * (1 - expected_result(winner_elo, loser_elo))
    return winner_elo + delta, loser_elo - delta
def expected_result(elo_a, elo_b):
    """Expected score of player a against player b (logistic Elo curve)."""
    exponent = (elo_b - elo_a) / elo_width
    return 1.0 / (1 + 10 ** exponent)
# Replay each game, looking up both players' current ratings.
for index, row in games.iterrows():
winnername = row['Winner']
losername = row['Looser']
web = players['elo'].loc[players['Name'] == winnername].values[0]
# BUG(review): this is a filtered DataFrame, not an index label, so it
# cannot be passed to players.at[...]; use .index.values as the fix
# described later in the text.
wIndex = players.loc[players['Name'] == winnername]
#I want to return just the index, so I can update the value
print(wIndex)
leb = players['elo'].loc[players['Name'] == losername].values[0]
print('Winner Elo before: ' + str(web))
winner_elo, looser_elo = update_elo(web, leb)
print('Winner Elo after: ' + str(winner_elo))
#here I want to update value
#players.at[wIndex,'elo']=winner_elo
I am trying to update the value in the players table using
players.at[wIndex,'elo']=winner_elo
but i struggle to get the index with this code:
wIndex = players.loc[players['Name'] == winnername]
Found a solution:
wIndex = players.loc[players['Name'] == winnername].index.values
Can't believe I missed that

Webscraping data from a json source, why i get only 1 row?

I'm trying to get some information from a website with Python — specifically, from a webshop.
I tried this one:
# NOTE(review): question code as posted (Python 2 — bare `print`);
# indentation was lost in transcription.
def proba():
my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
data = my_url.json()
results = []
products = data['MainContent'][0]['contents'][0]['productList']['products']
# Collect name / gross price / url for each product.
for product in products:
name = product['productModel']['displayName']
try:
# Sale price when present, otherwise the base price.
priceGross = product['priceInfo']['priceItemSale']['gross']
except:
priceGross = product['priceInfo']['priceItemToBase']['gross']
url = product['productModel']['url']
results.append([name, priceGross, url])
# BUG(review): only one row reaches the CSV — presumably the DataFrame
# build / to_csv (or the append above) sits at the wrong indentation
# relative to the for loop; confirm against the original layout.
df = pd.DataFrame(results, columns = ['Name', 'Price', 'Url'])
# print(df) ## print df
df.to_csv(r'/usr/src/Python-2.7.13/test.csv', sep=',', encoding='utf-8-sig',index = False )
# Poll repeatedly until just before midnight.
while True:
mytime=datetime.now().strftime("%H:%M:%S")
while mytime < "23:59:59":
print mytime
proba()
mytime=datetime.now().strftime("%H:%M:%S")
In this webshop there are 9 items, but i see only 1 row in the csv file.
Not entirely sure what you intend as end result. Are you wanting to update an existing file? Get data and write out all in one go? Example of latter shown below where I add each new dataframe to an overall dataframe and use a Return statement for the function call to provide each new dataframe.
import requests
from datetime import datetime
import pandas as pd
def proba():
    """Fetch the shop's product listing and return it as a DataFrame
    with Name / Price / Url columns."""
    response = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
    payload = response.json()
    product_list = payload['MainContent'][0]['contents'][0]['productList']['products']
    rows = []
    for item in product_list:
        title = item['productModel']['displayName']
        try:
            # Sale price when present, otherwise the base price.
            gross = item['priceInfo']['priceItemSale']['gross']
        except:
            gross = item['priceInfo']['priceItemToBase']['gross']
        link = item['productModel']['url']
        rows.append([title, gross, link])
    return pd.DataFrame(rows, columns=['Name', 'Price', 'Url'])
# Accumulate every fetch into one overall frame.
headers = ['Name', 'Price', 'Url']
df = pd.DataFrame(columns = headers)
# Poll until just before midnight, appending each snapshot.
while True:
mytime = datetime.now().strftime("%H:%M:%S")
while mytime < "23:59:59":
print(mytime)
dfCurrent = proba()
mytime=datetime.now().strftime("%H:%M:%S")
df = pd.concat([df, dfCurrent])
# NOTE(review): the outer `while True` never exits, so as transcribed the
# to_csv line would be unreachable — presumably it belongs inside the loop
# so the CSV is rewritten after each day's collection; confirm placement.
df.to_csv(r"C:\Users\User\Desktop\test.csv", encoding='utf-8')

Script not working anymore, .group() used to work but now is throwing an error

This is the portion of the code that's causing trouble:
import pandas as pd
import re
df
df.columns = ['Campaigns', 'Impressions', 'Attempts', 'Spend']
Campaigns = df['Campaigns']
IDs = []
# Extract the 6-digit id from each campaign name.
for c in Campaigns:
# BUG(review): re.search returns None when a campaign has no 6-digit run,
# and None has no .group() — that is the reported error; guard the match.
num = re.search(r'\d{6}',c).group()
IDs.append(num)
# Attach the extracted ids as a new 'ID' column.
pieces = [df,pd.DataFrame(IDs)]
frame = pd.concat(pieces, axis=1, join='outer',ignore_index=False)
frame['ID'] = frame[0]
del frame[0]
frame
This is the error:
Error: 'NoneType' object has no attribute 'group'
When I try things individually in ipython everything works, for example:
in>> test = 'YP_WON2_SP8_115436'
in>> num = re.search(r'\d{6}',test)
in>> num.group()
out>> '115436'
I've tried splitting up the code as above and it still throws the same error.
Fixed the code:
# Fixed version: guard against campaign names with no 6-digit id.
df
df.columns = ['Campaigns', 'Impressions', 'Attempts', 'Spend']
Campaigns = df['Campaigns']
ID = []
for c in Campaigns:
    m = re.search(r'\d{6}', c)
    if m:
        # Reuse the match object instead of running the search a second time.
        ID.append(m.group())
    else:
        ID.append('No ID')
# Attach the extracted ids as a new 'ID' column.
pieces = [df, pd.DataFrame(ID)]
frame = pd.concat(pieces, axis=1, join='outer', ignore_index=False)
frame['ID'] = frame[0]
del frame[0]
frame

Categories