Method to grab Pandas columns in Python

I am trying to learn OOP and want to convert some code I have so far into a class.
My code:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
# assign previous day to variable, then group and sum
prev_day = pd.read_csv('C:/Users/Name/PycharmProjects/Corona Stats/TimeSeries/03-28-2020.csv')
prev_day = prev_day.replace(np.nan, 'Other', regex=True)
prev_day = prev_day.groupby(['Country_Region']).sum()
prev_day = prev_day.reset_index()
# assign current day to variable, removed unwanted columns, then group and sum
stats_reader = pd.read_csv('C:/Users/Name/PycharmProjects/Corona Stats/TimeSeries/03-29-2020.csv')
stats_reader = stats_reader.replace(np.nan, 'Other', regex=True)
stats_clean = stats_reader.drop(['FIPS', 'Last_Update', 'Lat', 'Long_'], axis=1)
stats_clean = stats_clean.rename(columns={
    'Admin2': 'County', 'Province_State': 'State', 'Country_Region': 'Country', 'Combined_Key': 'City'})
stats_clean = stats_clean.groupby(['Country']).sum()
stats_clean = stats_clean.reset_index()
# add in new columns to show difference between days
stats_clean['New Cases'] = stats_clean['Confirmed'] - prev_day['Confirmed']
stats_clean['New Deaths'] = stats_clean['Deaths'] - prev_day['Deaths']
stats_clean['New Recovered'] = stats_clean['Recovered'] - prev_day['Recovered']
stats_clean = stats_clean[[
    'Country', 'Confirmed', 'New Cases',
    'Deaths', 'New Deaths', 'Recovered', 'New Recovered', 'Active']]
stats_clean = stats_clean.replace(np.nan, 0, regex=True)
# calculate for global cases from previous day
prev_sum = prev_day.sum()
prev_sum['Country'] = 'World'
prev_sum = prev_sum[['Country', 'Confirmed', 'Deaths', 'Recovered']]
prev_sum = prev_sum.replace(np.nan, 0, regex=True)
# calculate for global cases for current day
sum_stats = stats_clean.sum()
sum_stats['Country'] = 'World'
sum_stats['New Cases'] = sum_stats['Confirmed'] - prev_sum['Confirmed']
sum_stats = sum_stats.replace(np.nan, 0, regex=True)
sum_stats = sum_stats[[
    'Country', 'Confirmed', 'New Cases', 'Deaths', 'New Deaths', 'Recovered', 'New Recovered', 'Active']]
My first attempt:
class Corona:
    def __init__(self):
        pass

    def country_sum(self, country):
        country = stats_clean['Country'].isin([country])
        print(country)

Corona.country_sum('US')
If I make this a static method, it runs, but then I am not actually using the argument in country_sum. I want to filter for whatever country is passed in, but I don't know how to use a method's argument to filter for values in a column.
Sample rows from the original csv file:
FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
45001,Abbeville,South Carolina,US,2020-03-29 23:08:25,34.22333378,-82.46170658,3,0,0,0,"Abbeville, South Carolina, US"
22001,Acadia,Louisiana,US,2020-03-29 23:08:25,30.295064899999996,-92.41419698,9,1,0,0,"Acadia, Louisiana, US"
51001,Accomack,Virginia,US,2020-03-29 23:08:25,37.76707161,-75.63234615,3,0,0,0,"Accomack, Virginia, US"
16001,Ada,Idaho,US,2020-03-29 23:08:25,43.4526575,-116.24155159999998,92,1,0,0,"Ada, Idaho, US"
19001,Adair,Iowa,US,2020-03-29 23:08:25,41.33075609,-94.47105874,1,0,0,0,"Adair, Iowa, US"

If I am not mistaken, you should not perform all the calculations somewhere else outside the class and then access variables defined in the global scope, e.g. stats_clean.
You should rather be doing it like this:
import os

class Corona:
    root_dir = "<path-to-your-data-dir>"

    def __init__(self, date):
        # use glob or something if you want to process multiple files etc.
        self.file = os.path.join(self.root_dir, str(date) + ".csv")
        self._calculate_stats()

    def _calculate_stats(self):
        # <do all your dataset reading and calculations here>
        # <....>
        self.stats_clean = ...
        self.prev_sum = ...

    def country_sum(self, country='US'):
        return self.stats_clean['Country'].isin([country])
Then you can simply do:
corona = Corona('03-29-2020')
print(corona.country_sum(<your-country>))
This is just one way of doing it.
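As a side note, country_sum as written returns a boolean mask, not the matching rows. If you want the rows themselves, index the frame with the mask. A minimal sketch, continuing the names from the class above:
corona = Corona('03-29-2020')
mask = corona.country_sum('US')      # boolean Series, True where Country == 'US'
us_rows = corona.stats_clean[mask]   # the filtered rows
print(us_rows)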


Iterrows replacement for a calculation between each row of one dataframe and another

I'm trying to move away from iterrows due to its poor performance. I can't, however, find another solution for comparing each row of one dataframe with each row of another dataframe.
I have two dataframes each containing a latitude and a longitude. Previously I have used these two functions to make a distance calculation between the two coordinates shown here:
def find_matches(first_HL, second_HL, N, M):
    program_start = time.time()
    matched_sites_df = pd.DataFrame()
    for i_WP, r_WP in first_HL.iterrows():
        series = pd.Series(dtype=float)
        if r_WP['PL Name'] is not None and r_WP['PL Latitude'] is not None and r_WP['PL Longitude'] is not None:
            series = name_and_distance_match(i_WP, r_WP, second_HL, N, M)
        if series is not None:
            series = pd.DataFrame(series.to_frame().T)
            matched_sites_df = pd.concat([matched_sites_df, series], axis=0, ignore_index=True)
            now = time.time()
            print("------ MATCH FOUND ------ ", r_WP['PL Name'], "------", round(now - program_start, 2), "seconds")
    return matched_sites_df

def calc_distance(r_WP, r_HL):
    coords_1 = (r_WP['PL Latitude'], r_WP['PL Longitude'])
    coords_2 = (r_HL['Latitude'], r_HL['Longitude'])
    distance_km = round(geopy.distance.geodesic(coords_1, coords_2).km, 2)
    return distance_km

def name_and_distance_match(i_WP, r_WP, second_HL, N, M):
    for i_HL, r_HL in second_HL.iterrows():
        if pd.isnull(r_HL['Site Name']) or pd.isnull(r_WP['PL Name']):
            pass
        elif abs(r_WP['PL Latitude'] - r_HL['Latitude']) > 0.1:
            pass
        elif abs(r_WP['PL Longitude'] - r_HL['Longitude']) > 0.1:
            pass
        else:
            distance_km = r_WP['Distance (km)'] = calc_distance(r_WP, r_HL)
            if distance_km < M:
                r_HL = filter_town(r_WP, r_HL)
                score = r_WP['Name Similarity'] = np.vectorize(fuzzy)(r_HL["HL Site Short"], r_WP['PL Name'])
                if score > N:
                    r_WP["HL Site Short"] = r_HL["HL Site Short"]
                    return r_WP
Is there a way I can do this without iterrows?
The solution I'm working on at the moment looks like this:
def distance_check(first_HL, second_WPHL):
    first_lat = first_HL["Latitude"]
    first_long = second_WPHL["PL Longitude"]
    second_lat = first_HL["Latitude"]
    second_long = second_WPHL["PL Longitude"]
    if abs(first_lat - second_lat) + abs(first_long - second_long) > 0.2:
        return False
    else:
        COMBINED_HOUSELIST["WHATPUB Site Name"] = PUBMATCH_WHATPUB_SITES["Site Name"]
        return True

COMBINED_HOUSELIST["Distance Check"] = COMBINED_HOUSELIST.apply(distance_check(PUBMATCH_WHATPUB_SITES, COMBINED_HOUSELIST), axis=1)
Any help would be greatly appreciated, thank you.
EDIT: Example Dataframes
COMBINED_HOUSELIST = pd.DataFrame(
    np.array([["12345", "Wrexham Cwtch", "52.10", "-2.06"],
              ["12354", "Horse & Hound", "52.21", "-1.95"],
              ["12435", "Round Of Gras Badsey", "52.33", "-1.99"]]),
    columns=['Site Number', 'Site Name', 'Longitude', 'Latitude'])
PUBMATCH_WHATPUB_SITES = pd.DataFrame(
    np.array([["52938", "Valkyrie Café Bar", "53.22", "-3.00"],
              ["12435", "Round Of Badsey", "52.33", "-1.99"],
              ["12345", "Cwtch", "52.11", "-2.00"]]),
    columns=['Site Number', 'Site Name', 'Longitude', 'Latitude'])
Desired output
matched_sites = pd.DataFrame(
    np.array([["12345", "Wrexham Cwtch", "52.10", "-2.06"],
              ["12354", "Horse & Hound", "52.21", "-1.95"],
              ["12435", "Round Of Gras Badsey", "52.33", "-1.99"]]),
    columns=['Site Number', 'Site Name', 'Longitude', 'Latitude'])
One way or another, I fear that you will have to resort to some form of iteration, but doing it outside of Pandas might speed things up.
So, here is one way to do it with map and partial from the Python standard library.
First, define two helper functions:
from functools import partial

def calc_distance(coo1, coo2):
    # Manhattan distance between two [lat, long] pairs
    return abs(coo1[0] - coo2[0]) + abs(coo1[1] - coo2[1])

def find_matches(one_list, another_list, threshold):
    idx = []
    for coo in one_list:
        func = partial(calc_distance, coo)
        results = list(map(func, another_list))
        # enumerate avoids the wrong index that results.index() would return
        # when two candidates sit at the same distance
        idx.append([i for i, result in enumerate(results) if result <= threshold])
    return idx
Then, with the following toy dataframes:
import pandas as pd
import numpy as np
COMBINED_HOUSELIST = pd.DataFrame(
    np.array(
        [
            ["12345", "Wrexham Cwtch", "52.10", "-2.06"],
            ["12354", "Horse & Hound", "52.21", "-1.95"],
            ["12435", "Round Of Gras Badsey", "52.33", "-1.99"],
        ]
    ),
    columns=["Site Number", "Site Name", "Longitude", "Latitude"],
)
PUBMATCH_WHATPUB_SITES = pd.DataFrame(
    np.array(
        [
            ["52938", "Valkyrie Café Bar", "53.22", "-3.00"],
            ["54999", "New Café Bar", "52.10", "-2.1"],
            ["12435", "Round Of Badsey", "52.33", "-1.99"],
            ["12345", "Cwtch", "52.11", "-2.00"],
        ]
    ),
    columns=["Site Number", "Site Name", "Longitude", "Latitude"],
)
You can proceed like this:
# Setup
for col in ["Latitude", "Longitude"]:
    for df in [COMBINED_HOUSELIST, PUBMATCH_WHATPUB_SITES]:
        df[col] = pd.to_numeric(df[col])

# Get two lists of coordinates looking like [[lat, long], [lat, long], ...]
CH_COO = COMBINED_HOUSELIST.loc[:, ["Latitude", "Longitude"]].to_dict("split")["data"]
PW_COO = PUBMATCH_WHATPUB_SITES.loc[:, ["Latitude", "Longitude"]].to_dict("split")["data"]

# Look for matches
COMBINED_HOUSELIST = COMBINED_HOUSELIST.assign(match=find_matches(CH_COO, PW_COO, 0.1))

# Get site names
COMBINED_HOUSELIST["match"] = COMBINED_HOUSELIST.apply(
    lambda x: [PUBMATCH_WHATPUB_SITES.loc[idx, "Site Name"] for idx in x["match"]],
    axis=1,
)
Finally, print(COMBINED_HOUSELIST) to inspect the matches.
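With the toy data above, the match column should come out as [['New Café Bar', 'Cwtch'], [], ['Round Of Badsey']]: the second house has no candidate within the 0.1 threshold. If the lists grow large, the same matching can also be done fully vectorized with NumPy broadcasting. A sketch of that alternative (my addition, reusing the frames above, not part of the original answer):
import numpy as np

a = COMBINED_HOUSELIST[["Latitude", "Longitude"]].to_numpy(dtype=float)
b = PUBMATCH_WHATPUB_SITES[["Latitude", "Longitude"]].to_numpy(dtype=float)
# pairwise Manhattan distances, shape (len(a), len(b))
dist = np.abs(a[:, None, :] - b[None, :, :]).sum(axis=2)
matches = [np.flatnonzero(row <= 0.1).tolist() for row in dist]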

Pandas how to search one df for a certain date and return that data

I have two dataframes, and I am trying to search each row by date in the User.csv file, find the corresponding date in the Raven.csv file, and then return the Price from df1 plus the date and amount from df2.
This is working, but my Price returns a nested value like [[0.11465]]. Is there a way to remove these brackets, or a better way to do this?
import pandas as pd

df1 = pd.read_csv('Raven.csv')
df2 = pd.read_csv('User.csv')
df1 = df1.reset_index(drop=False)
df1.columns = ['index', 'Date', 'Price']
df2['Timestamp'] = pd.to_datetime(df2['Timestamp'], format="%Y-%m-%d %H:%M:%S").dt.date
df1['Date'] = pd.to_datetime(df1['Date'], format="%Y-%m-%d").dt.date

Looper = 0
Date = []
Price = []
amount = []
total_value = []
for x in df2['Timestamp']:
    search = df2['Timestamp'].values[Looper]
    Date.append(search)
    price = df1.loc[df1['Date'] == search, ['index']]
    value = df1['Price'].values[price]
    Price.append(value)
    payout = df2['Amount'].values[Looper]
    amount.append(payout)
    payout_value = value * payout
    total_value.append(payout_value)
    Looper = Looper + 1

dict = {'Date': Date, 'Price': Price, 'Payout': amount, "Total Value": total_value}
df = pd.DataFrame(dict)
df.to_csv('out.csv')
You can index into the nested list to get the scalar value:
value = [[0.11465]][0][0]
print(value)
You get:
0.11465
I hope this is what you need.
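For what it's worth, the brackets appear because df1['Price'].values[price] indexes with a one-column DataFrame. A scalar can be pulled out directly with df1.loc[df1['Date'] == search, 'Price'].iloc[0], or the loop can be dropped entirely in favour of a merge on the date. A sketch of the merge route, assuming df1 holds one Price per Date:
merged = df2.merge(df1[['Date', 'Price']], left_on='Timestamp', right_on='Date', how='left')
merged['Total Value'] = merged['Price'] * merged['Amount']
merged[['Date', 'Price', 'Amount', 'Total Value']].to_csv('out.csv', index=False)
This yields scalar prices, so there are no brackets to strip.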

How to replace a value in python pandas using a for loop

I am analyzing data from an experiment, and have to combine measurements from two devices (variables "data" and "behavioral").
I want to be able to walk through the recordings in the "data" dataframe and then, if a specific condition is met (if x == 'Stimulus/S 2'), have the value replaced with the corresponding value in the "behavioral" dataframe.
This is my code so far:
import os
import csv
import pandas as pd

os.chdir('C:/Users/eeg1/Desktop/NSM-Pilot')

col_names2 = ['Stimulus Name', 'Stimulus Content', 'Trials', 'This Rep Number', 'trials.thisTrialN', 'trials.thisN', 'trials.thisIndex', 'Button Box Keys', 'Button Box RT', 'Date', 'Frame Rate', 'expName', 'Session', 'Participant']
behavioral = pd.read_csv('NSM-01-behavioral.csv', names=col_names2, skiprows=[0])
counter2 = 0
for y in behavioral['Stimulus Content']:
    counter2 += 1

col_names = ['Event Number', 'Onset', 'Duration', 'Description']
data = pd.read_csv(r'NSM-01-01.csv', names=col_names, skiprows=[0])
counter = 0
for x in data['Description']:
    if x == 'Stimulus/S 2':
        x = behavioral['Stimulus Content'][counter]
        counter += 1
The problem is that this part seems to have no effect:
x = behavioral['Stimulus Content'][counter]
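The reason it has no effect is that x = ... only rebinds the loop variable; it never writes back into the dataframe. A minimal sketch of the usual fix with .loc on a boolean mask, assuming the k-th 'Stimulus/S 2' row should receive the k-th behavioral value (as the counter logic suggests):
mask = data['Description'] == 'Stimulus/S 2'
# write the first mask.sum() behavioral values into the matching rows
data.loc[mask, 'Description'] = behavioral['Stimulus Content'].iloc[:mask.sum()].to_numpy()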

How to use pandas INPUT function to get a list of customers

I have created a code to get users of my platform based on 2 things:
choiceTitle: search for a specific word contained in the title of an Ad that users of my platform have looked at. For example, the Ad is "We are offering free Gin" and I want to get the word 'Gin'
PrimaryTagPreviousChoice: the Ad has a "Food and Drink" tag
I can get those users who are interested in Gin and Food and Drink with:
(df2['choiceTitle'].str.contains("(?i)Gin")) & (df2['PrimaryTagPreviousChoice'].str.contains("(?i)Food and Drink"))
What I'd like to do is create a function with all my code inside (the SQL query, the rename operation, the sort_values operation, etc.) and then use the input() function, so that when I run my code Python will ask me 2 questions:
choiceTitle? ... Gin
PrimaryTagPreviousChoice? ...Food and Drink.
I enter the 2 options and it gives me the users interested in, let's say, Gin and Food and Drink.
How can I do it?
MY CODE:
df = pd.read_sql_query(""" select etc..... """, con)
df1 = pd.read_sql_query(""" select etc..... """, con)
df1['user_id'] = df1['user_id'].apply(str)
df2 = pd.merge(df, df1, left_on='user_id', right_on='user_id', how='left')
tag = df2[
    (df2['choiceTitle'].str.contains("(?i)Gin")) &
    (df2['PrimaryTagPreviousChoice'].str.contains("(?i)Food and Drink"))
]
dw = tag[['user', 'title', 'user_category', 'email', 'last_login',
          'PrimaryTagPreviousChoice', 'choiceTitle']].drop_duplicates()
dw = dw.sort_values(['last_login'], ascending=[False])
dw = dw[dw.last_login > dt.datetime.now() - pd.to_timedelta("30day")]
dw = dw.rename({'user': 'user full name', 'title': 'user title'}, axis='columns')
dw.drop_duplicates(subset="Email", keep='first', inplace=True)
Adding a function in Python is simple. Just use the def keyword to declare the function and put your existing code under it (indented), with the parameters in the parentheses.
Here is the updated code:
def GetUsers(title, tag):
    df = pd.read_sql_query(""" select etc..... """, con)
    df1 = pd.read_sql_query(""" select etc..... """, con)
    df1['user_id'] = df1['user_id'].apply(str)
    df2 = pd.merge(df, df1, left_on='user_id', right_on='user_id', how='left')
    tag = df2[
        (df2['choiceTitle'].str.contains("(?i)" + title)) &
        (df2['PrimaryTagPreviousChoice'].str.contains("(?i)" + tag))]
    dw = tag[['user', 'title', 'user_category', 'email', 'last_login',
              'PrimaryTagPreviousChoice', 'choiceTitle']].drop_duplicates()
    dw = dw.sort_values(['last_login'], ascending=[False])
    dw = dw[dw.last_login > dt.datetime.now() - pd.to_timedelta("30day")]
    dw = dw.rename({'user': 'user full name', 'title': 'user title'}, axis='columns')
    dw.drop_duplicates(subset="Email", keep='first', inplace=True)
    return dw  # send back to print statement

# get input from user
inpTitle = input("choiceTitle? ")
inpTag = input("PrimaryTagPreviousChoice? ")

# run function
result = GetUsers(inpTitle, inpTag)
print(result)
Try this. Save your input() as variables and use str.format() to substitute them into your masks:
choiceTitle = input('choiceTitle?')
PrimaryTagPreviousChoice = input('PrimaryTagPreviousChoice?')
mask = df2[(df2['choiceTitle'].str.contains("(?i){0}".format(choiceTitle))) &
           (df2['PrimaryTagPreviousChoice'].str.contains("(?i){0}".format(PrimaryTagPreviousChoice)))]
dw = mask[['user', 'title', 'user_category', 'email', 'last_login',
           'PrimaryTagPreviousChoice', 'choiceTitle']].drop_duplicates()
....
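One caveat worth adding (my note, not from either answer): str.contains treats its pattern as a regular expression, so typed characters like ( or + will raise or mismatch. Escaping the input, or turning regex matching off, avoids that:
import re

pattern = "(?i)" + re.escape(choiceTitle)   # neutralize any regex metacharacters
df2['choiceTitle'].str.contains(pattern)
# or skip regex entirely:
df2['choiceTitle'].str.contains(choiceTitle, case=False, regex=False)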

Python 3.7 KeyError

I would like to retrieve information from NewsApi and ran into an issue. Enclosed is the code:
from NewsApi import NewsApi
import pandas as pd
import os
import datetime as dt
from datetime import date

def CreateDF(JsonArray, columns):
    dfData = pd.DataFrame()
    for item in JsonArray:
        itemStruct = {}
        for cunColumn in columns:
            itemStruct[cunColumn] = item[cunColumn]
        # dfData = dfData.append(itemStruct, ignore_index=True)
        # dfData = dfData.append({'id': item['id'], 'name': item['name'], 'description': item['description']},
        #                        ignore_index=True)
        # return dfData
    return itemStruct

def main():
    # access_token_NewsAPI.txt must contain your personal access token
    with open("access_token_NewsAPI.txt", "r") as f:
        myKey = f.read()[:-1]
    # myKey = 'a847cee6cc254d8495632f83d5c77d39'
    api = NewsApi(myKey)

    # get sources of news
    # columns = ['id', 'name', 'description']
    # rst_source = api.GetSources()
    # df = CreateDF(rst_source['sources'], columns)
    # df.to_csv('source_list.csv')
    #
    # # get news for specific country
    # rst_country = api.GetHeadlines()
    # columns = ['author', 'publishedAt', 'title', 'description', 'content', 'url']
    # df = CreateDF(rst_country['articles'], columns)
    # df.to_csv('Headlines_country.csv')

    # get news for specific symbol
    symbol = "coronavirus"
    sources = 'bbc.co.uk'
    columns = ['author', 'publishedAt', 'title', 'description', 'content', 'source']
    limit = 500  # maximum requests per day
    i = 1
    startDate = dt.datetime(2020, 3, 1, 8)
    # startDate = dt.datetime(2020, 3, 1)
    df = pd.DataFrame({'author': [], 'publishedAt': [], 'title': [], 'description': [], 'content': [], 'source': []})
    while i < limit:
        endDate = startDate + dt.timedelta(hours=2)
        rst_symbol = api.GetEverything(symbol, 'en', startDate, endDate, sources)
        rst = CreateDF(rst_symbol['articles'], columns)
        df = df.append(rst, ignore_index=True)
        # DF.join(df.set_index('publishedAt'), on='publishedAt')
        startDate = endDate
        i += 1
    df.to_csv('Headlines_symbol.csv')

main()
I got the following error:
rst = CreateDF(rst_symbol['articles'], columns)
KeyError: 'articles'
In this line:
rst = CreateDF(rst_symbol['articles'], columns)
I think there is some problem regarding the key not being found or defined. Does anyone have an idea how to fix that? I'm thankful for every hint!
MAiniak
EDIT:
I found the solution after trying a few of your hints. Apparently, the error occurred whenever the NewsAPI key ran into its request limit. This happened every time until I changed limit = 500 to limit = 20. For some reason, there is no error with a new API key and the reduced limit.
Thanks for your help guys!
Probably 'articles' is not one of the keys in your rst_symbol dict; when a request fails (for example, once the rate limit is hit, as your edit suggests), the API returns an error payload without an 'articles' key.
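A defensive check before indexing makes that failure visible instead of raising a KeyError. A sketch, assuming the response follows NewsAPI's documented {'status': ..., 'articles': ...} shape:
rst_symbol = api.GetEverything(symbol, 'en', startDate, endDate, sources)
if rst_symbol.get('status') != 'ok' or 'articles' not in rst_symbol:
    # e.g. {'status': 'error', 'code': 'rateLimited', ...}
    print("API error response:", rst_symbol)
    break  # inside the while loop from the question
rst = CreateDF(rst_symbol['articles'], columns)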
The Python documentation [2] [3] doesn't mention any method named NewsApi() or GetEverything(), but rather NewsApiClient() and get_everything(), i.e.:
from newsapi import NewsApiClient

# Init
newsapi = NewsApiClient(api_key='xxx')

# /v2/top-headlines
top_headlines = newsapi.get_top_headlines(q='bitcoin',
                                          sources='bbc-news,the-verge',
                                          category='business',
                                          language='en',
                                          country='us')

# /v2/everything
all_articles = newsapi.get_everything(q='bitcoin',
                                      sources='bbc-news,the-verge',
                                      domains='bbc.co.uk,techcrunch.com',
                                      from_param='2017-12-01',
                                      to='2017-12-12',
                                      language='en',
                                      sort_by='relevancy',
                                      page=2)

# /v2/sources
sources = newsapi.get_sources()
