import numpy as np
# Collect the compound values for each news source
score_table = df.pivot_table(index='User', values="Compound", aggfunc=np.mean)
score_table
from collections import Counter
import pandas as pd
a = dict(Counter(HT_positive))
t = list(a.items())
compound = score_table["Compound"]
df = pd.DataFrame(t, columns=["Hashtags", "Number of Occurence"])
df4 = df.append(compound)
df.to_csv('hashtags.csv', index=False)
df4_saved_file = pd.read_csv('hashtags.csv')
df4_saved_file
I'm getting a KeyError: "Compound". How do I add the "Compound" column in between "Hashtags" and "Number of Occurence"?
I think you should check whether "Compound" or "User" is the key you actually need when you index into score_table.
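As a minimal sketch, assuming the score_table and df built above, you can first list the keys that actually exist, then place the column where you want it with insert:
# 'User' lives in the index of the pivot table, not in its columns,
# so check both before indexing into score_table
print(score_table.columns)     # should contain 'Compound'
print(score_table.index.name)  # should be 'User'
# Hypothetical: if the row counts line up, this would put 'Compound'
# between 'Hashtags' and 'Number of Occurence' (position 1)
# df.insert(1, 'Compound', compound.values)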
I really do not understand how the check (if entry in langs_count) can work when the dictionary was initialized to be empty. What is inside the dictionary, and how did it get there? I'm really confused.
import pandas as pd
# Import Twitter data as DataFrame: df
df = pd.read_csv("tweets.csv")
# Initialize an empty dictionary: langs_count
langs_count = {}
# Extract column from DataFrame: col
col = df['lang']
# Iterate over lang column in DataFrame
for entry in col:
    # If the language is in langs_count, add 1
    if entry in langs_count.keys():
        langs_count[entry] += 1
    # Else add the language to langs_count, set the value to 1
    else:
        langs_count[entry] = 1
# Print the populated dictionary
print(langs_count)
To answer the confusion directly: langs_count starts out empty, so on the first tweet the check (entry in langs_count) is False and the else branch inserts that language with a count of 1; every later occurrence of the same language then takes the if branch and increments it. Nothing is inside the dictionary until the loop puts it there. You can also implement the count functionality using groupby.
import pandas as pd
# Import Twitter data as DataFrame: df
df = pd.read_csv("tweets.csv")
# Populate dictionary with count of occurrences in 'lang' column
langs_count = dict(df.groupby(['lang']).size())
# Print the populated dictionary
print(langs_count)
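For comparison, the same counts can be had in one line with value_counts, a built-in pandas Series method:
# Same result without an explicit groupby
langs_count = df['lang'].value_counts().to_dict()
print(langs_count)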
I am doing a Pandas exercise in which I need to do the following: complete the function to clean a specified column in a dataframe by removing non-numeric characters, filling missing values with 0, and transforming the data type to a float.
The code I wrote is the following:
import pandas as pd
import regex as re
def clean_price_column(df, column):
    df['price'] = df['price'].astype(str).str.extract('(\d+)', expand=False)
    df['price'] = df.price.astype(float)
    df['price'] = df['price'].fillna(0)
    solution = df
    return solution
test_case = clean_price_column(prices_df, 'price')
I am getting the right result but also getting an error, as stated above. How can I fix it?
Possible solution is the following:
import pandas as pd
import numpy as np
def clean_price_column(df, column):
    df[column] = df[column].astype(str).str.extract(r'(\d+)', expand=False)
    df[column] = df[column].fillna(0)
    df[column] = df[column].astype(float)
    return df
# specify test data
data = {"price": ['$1234', '', 100, '200', None, np.nan, 'txt']}
# create pandas dataframe
prices_df = pd.DataFrame(data)
# apply function to pandas dataframe column
prices_df_updated = clean_price_column(prices_df, 'price')
prices_df_updated
Returns
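    price
0  1234.0
1     0.0
2   100.0
3   200.0
4     0.0
5     0.0
6     0.0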
Currently working with pybliometrics (Scopus), I want to create a loop that gets affiliation information for multiple authors.
Basically, this is the idea of my loop. How do I do that with many authors?
from pybliometrics.scopus import AuthorRetrieval
import pandas as pd
import numpy as np
au = AuthorRetrieval(authorid)
au.affiliation_history
au.identifier
x = au.identifier
refs2 = au.affiliation_history
len(refs2)
refs2
df = pd.DataFrame(refs2)
df.columns
a_history = df
df['authorid'] = x
#moving authorid to 0
cols = list(df)
cols.insert(0, cols.pop(cols.index('authorid')))
df = df.loc[:, cols]
df.to_excel("af_historyfinal.xlsx")
Turning your code into a loop over multiple author IDs? Nothing easier than that. Let's say AUTHOR_IDS equals 7004212771 and 57209617104:
import pandas as pd
from pybliometrics.scopus import AuthorRetrieval
def retrieve_affiliations(auth_id):
    """Author's affiliation history from Scopus as DataFrame."""
    au = AuthorRetrieval(auth_id)
    df = pd.DataFrame(au.affiliation_history)
    df["auth_id"] = au.identifier
    return df

AUTHOR_IDS = [7004212771, 57209617104]
# Option 1, for few IDs
df = pd.concat([retrieve_affiliations(a) for a in AUTHOR_IDS])
# Option 2, for many IDs (DataFrame.append was removed in pandas 2.0,
# so accumulate with pd.concat instead)
df = pd.DataFrame()
for a in AUTHOR_IDS:
    df = pd.concat([df, retrieve_affiliations(a)])
# Have author ID as first column
df = df.set_index("auth_id").reset_index()
df.to_excel("af_historyfinal.xlsx", index=False)
If, say, your IDs are in a comma-separated file called "input.csv", with one column called "authors", then you start with
AUTHOR_IDS = pd.read_csv("input.csv")["authors"].unique()
I have a table of "Borrower Personal ID" and "Loan ID".
BwrPersonId LoanId
113225 16330
113225 27073
113225 68842
113253 16341
113269 16348
113285 16354
113289 26768
113297 16360
113299 16361
113319 16369
113418 16403
113418 26854
I'm trying to work out which loans belong to the same borrower, so I group by "BwrPersonId" and want each borrower's loans spread across columns (Loan1, Loan2, ...). I'm expecting something like this, one row per borrower:
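  BwrPersonId  Loan1  Loan2  Loan3
0      113225  16330  27073  68842
1      113253  16341
2      113269  16348
3      113285  16354
4      113289  26768
5      113297  16360
6      113299  16361
7      113319  16369
8      113418  16403  26854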
Here is my code, but it doesn't work.
grouped = pd.DataFrame()
unique = loan['BwrPersonId'].unique()
grouped['BwrPersonId'] = ''*len(loan['BwrPersonId'].unique())
grouped['Loan1'] = ''
grouped['Loan2'] = ''
grouped['Loan3'] = ''
grouped['Loan4'] = ''
grouped['Loan5'] = ''
grouped.iloc[:,0] = unique
for i in grouped.index:
    idloan = loan.loc[loan['BwrPersonId'] == unique[i], 'LoanId']
    grouped.iloc[i, 1:len(idloan)+1] = idloan
    print(i)
How can I do it now? And is there any other way that can simplify the code? Thanks a lot for your help.
Basically, you can factorize the borrower IDs, give each borrower's loans a running position number with a counter, and then scatter the loan IDs into an empty array whose columns become Loan1, Loan2, and so on.
import pandas as pd
import numpy as np
from collections import defaultdict
from itertools import count

# One running counter per borrower: numbers that borrower's loans 0, 1, 2, ...
counters = defaultdict(count)
ids, uniques = pd.factorize(loan['BwrPersonId'])
pos = np.array([next(counters[x]) for x in ids])
n_rows, n_cols = len(uniques), pos.max() + 1
# Scatter each LoanId into its (borrower row, loan position) slot
temp = np.empty((n_rows, n_cols), dtype=object)
temp[ids, pos] = loan['LoanId']
df1 = pd.DataFrame(uniques.tolist(), columns=['BwrPersonId'])
df2 = pd.DataFrame(temp, columns=[f'Loan{i+1}' for i in range(n_cols)])
final = df1.join(df2)
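With the sample table above, pos.max() + 1 is 3, so final comes out with columns BwrPersonId, Loan1, Loan2 and Loan3, one row per borrower; slots with no loan are left as None.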
I have two dataframes, and I am trying to find a way to match an exact substring from one dataframe in the other.
First DataFrame:
import pandas as pd
import numpy as np
random_data = {'Place Name':['TS~HOT_MD~h_PB~progra_VV~gogl', 'FM~uiosv_PB~emo_SZ~1x1_TG~bhv'],
'Site':['DV360', 'Adikteev']}
dataframe = pd.DataFrame(random_data)
print(dataframe)
Second DataFrame
test_data = {'code name': ['PB', 'PB', 'PB'],
'Actual':['programmatic me', 'emoteev', 'programmatic-mechanics'],
'code':['progra', 'emo', 'prog']}
test_dataframe = pd.DataFrame(test_data)
Approach
for k, l, m in zip(test_dataframe.iloc[:, 0], test_dataframe.iloc[:, 1], test_dataframe.iloc[:, 2]):
    dataframe['Site'] = np.select([dataframe['Place Name'].str.contains(r'\b{}~{}\b'.format(k, m), regex=False)],
                                  [l], default=dataframe['Site'])
The current output is as below, though I am expecting to match the exact substring, which is not working with the code above.
Current Output:
Place Name Site
TS~HOT_MD~h_PB~progra_VV~gogl programmatic-mechanics
FM~uiosv_PB~emo_SZ~1x1_TG~bhv emoteev
Expected Output:
Place Name Site
TS~HOT_MD~h_PB~progra_VV~gogl programmatic me
FM~uiosv_PB~emo_SZ~1x1_TG~bhv emoteev
Data
import pandas as pd
import numpy as np
random_data = {'Place Name':['TS~HOT_MD~h_PB~progra_VV~gogl',
'FM~uiosv_PB~emo_SZ~1x1_TG~bhv'], 'Site':['DV360', 'Adikteev']}
dataframe = pd.DataFrame(random_data)
test_data = {'code name': ['PB', 'PB', 'PB'], 'Actual':['programmatic me', 'emoteev', 'programmatic-mechanics'],
'code':['progra', 'emo', 'prog']}
test_dataframe = pd.DataFrame(test_data)
Map the test_dataframe code and Actual columns into a dictionary as keys and values respectively:
keys=test_dataframe['code'].values.tolist()
dicto=dict(zip(test_dataframe.code, test_dataframe.Actual))
dicto
Join the keys with | so the search can match any of the phrases:
k = '|'.join(r"{}".format(x) for x in dicto.keys())
k
Extract the substring from the dataframe matching any of the phrases in k and map it through the dictionary:
dataframe['Site'] = dataframe['Place Name'].str.extract('('+ k + ')', expand=False).map(dicto)
dataframe
Output
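                      Place Name             Site
0  TS~HOT_MD~h_PB~progra_VV~gogl  programmatic me
1  FM~uiosv_PB~emo_SZ~1x1_TG~bhv          emoteev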
Not the most elegant solution, but this does the trick.
Set up data
import pandas as pd
import numpy as np
random_data = {'Place Name':['TS~HOT_MD~h_PB~progra_VV~gogl',
'FM~uiosv_PB~emo_SZ~1x1_TG~bhv'], 'Site':['DV360', 'Adikteev']}
dataframe = pd.DataFrame(random_data)
test_data = {'code name': ['PB', 'PB', 'PB'], 'Actual':['programmatic me', 'emoteev', 'programmatic-mechanics'],
'code':['progra', 'emo', 'prog']}
test_dataframe = pd.DataFrame(test_data)
Solution
Create a column in test_dataframe with the substring to match:
test_dataframe['match_str'] = test_dataframe['code name'] + '~' + test_dataframe.code
print(test_dataframe)
code name Actual code match_str
0 PB programmatic me progra PB~progra
1 PB emoteev emo PB~emo
2 PB programmatic-mechanics prog PB~prog
Define a function to apply to test_dataframe:
def match_string(row, dataframe):
    ind = row.name
    try:
        # row.iloc[-1] is 'match_str', row.iloc[1] is 'Actual'
        if row.iloc[-1] in dataframe.loc[ind, 'Place Name']:
            return row.iloc[1]
        else:
            return dataframe.loc[ind, 'Site']
    except KeyError:
        # More rows in test_dataframe than there are in dataframe
        pass
# Apply match_string and assign back to dataframe
dataframe['Site'] = test_dataframe.apply(match_string, args=(dataframe,), axis=1)
Output:
Place Name Site
0 TS~HOT_MD~h_PB~progra_VV~gogl programmatic me
1 FM~uiosv_PB~emo_SZ~1x1_TG~bhv emoteev