How to pass a range in a def function - python

I want to pass a range to my web-scraping function, but I'm not sure how it's done. This would make my code more reusable, so that I can scrape different ranges for different dates, say 2016, 2017, 2018, and so on. The code looks like this:
import numpy as np
import pandas as pd
import requests
# For each game ID, request that game's boxscore from the NHL stats API and
# collect one dict per side ('home' and 'away') holding the team name, the
# team ID, the side and the game ID.
# The signature accepts exactly one positional argument (game_id), so a call
# that passes three positional arguments raises a TypeError.
# NOTE(review): no return statement is visible in this block, so the collected
# list is not handed back to the caller.
def game_id2017(game_id):
# Accumulates one dict per team per game.
games_played_2017 = []
# NOTE(review): this list is never used again in this block.
games_played_2018 = []
print('Getting data...')
# NOTE(review): range() is called without arguments, and the loop variable
# reuses the name of the game_id parameter, so the value passed in is never
# used to build the range.
for game_id in range():
url = 'https://statsapi.web.nhl.com/api/v1/game/{}/boxscore'.format(game_id)
r_2017 = requests.get(url)
game_data_2017 = r_2017.json()
# One record per side of the game.
for homeaway in ['home','away']:
game_dict_2017 = dict()
game_dict_2017['team'] = game_data_2017.get('teams').get(homeaway).get('team').get('name')
game_dict_2017['teamID'] = game_data_2017.get('teams').get(homeaway).get('team').get('id')
game_dict_2017['homeaway'] = homeaway
game_dict_2017['game_id'] = game_id
games_played_2017.append(game_dict_2017)
game_id2017(20170201, 20170210, 1)
TypeError: game_id2017() takes 1 positional argument but 3 were given

game_id2017(*game_id)
for id in game_id:
then use game_id like a list

Pass a list:
import numpy as np
import pandas as pd
import requests
def game_id2017(game_id):
    """Visit every game ID in the range described by ``game_id``.

    Args:
        game_id: The arguments for ``range`` packed in a sequence, e.g.
            ``[start, stop]`` or ``[start, stop, step]``.

    Returns:
        The list of game IDs that were visited, in order.
    """
    print('Getting data...')
    visited = []
    # Unpack the sequence into range(); range(len(game_id)) would only count
    # 0, 1, 2, ... and never reach the requested IDs.
    for a_game_id in range(*game_id):
        # use a_game_id (e.g. request that game's boxscore here)
        visited.append(a_game_id)
    return visited
game_id2017([20170201, 20170210, 1])

Related

Adding Column to data frame based on list content in a loop? - Python

I'm pulling data from the NHL API for player stats based on individual games. I'm trying to make a loop that calls the data, parses the JSON, creates a dict which I then can create a data frame from for an entire team. The code before my looping looks like this:
# Download one player's game log and flatten it into a DataFrame with one row
# per game: the game's 'stat' fields plus date, venue/result flags and the
# team and opponent names.
API_URL = "https://statsapi.web.nhl.com/api/v1"
response = requests.get(
    API_URL + "/people/8477956/stats?stats=gameLog",
    params={"Content-Type": "application/json"},
)
data = json.loads(response.text)

df_list_dict = []
for game in data['stats'][0]['splits']:
    # The 'stat' dict itself is extended, exactly as the rows are collected.
    curr_dict = game['stat']
    curr_dict.update({
        'date': game['date'],
        'isHome': game['isHome'],
        'isWin': game['isWin'],
        'isOT': game['isOT'],
        'team': game['team']['name'],
        'opponent': game['opponent']['name'],
    })
    df_list_dict.append(curr_dict)

df = pd.DataFrame.from_dict(df_list_dict)
print(df)
This gives me a digestible data frame for a single player. (/people/{player}/....
I want to iterate through a list (the list being an NHL team), while adding a column that identifies the player and concatenates the created data frames. My attempt thus far looks like this:
import requests
import json
import pandas as pd
Rangers = ['8478550', '8476459', '8479323', '8476389', '8475184', '8480817', '8480078', '8476624', '8481554', '8482109', '8476918', '8476885', '8479324',
'8482073', '8479328', '8480833', '8478104', '8477846', '8477380', '8477380', '8477433', '8479333', '8479991']
def callapi(player):
    """Download the game log for one player ID and print it as a DataFrame.

    Each game's 'stat' dict is extended in place with the date, the
    home/win/overtime flags and the team and opponent names; the rows are
    then turned into a DataFrame, which is printed. Nothing is returned.
    """
    endpoint = f'https://statsapi.web.nhl.com/api/v1/people/{player}/stats?stats=gameLog'
    reply = requests.get(endpoint, params={"Content-Type": "application/json"})
    payload = json.loads(reply.text)

    rows = []
    for split in payload['stats'][0]['splits']:
        row = split['stat']
        row.update({
            'date': split['date'],
            'isHome': split['isHome'],
            'isWin': split['isWin'],
            'isOT': split['isOT'],
            'team': split['team']['name'],
            'opponent': split['opponent']['name'],
        })
        rows.append(row)

    print(pd.DataFrame.from_dict(rows))
for player in Rangers:
callapi(player)
print(callapi)
When this is printed I can see all the data frames that were created. I cannot use curr_dict[] to add a column based on the list position (the player ID), because the index must be an integer or a slice, not a string.
What I'm hoping to do is make this one data frame in which the stats are identified by a player id column.
My python knowledge is very scattered, I feel as if with the progress I've made I should know how to complete this but I've simply hit a wall. Any help would be appreciated.
You can use concurrent.futures to parallelize the requests before concatenating them all together, and json_normalize to parse the json.
import concurrent.futures
import json
import os
import pandas as pd
import requests
class Scrape:
    """Download per-game stats for a roster of NHL players into one DataFrame."""

    def main(self) -> pd.DataFrame:
        """Fetch every player's game log in parallel and concatenate the results.

        Returns:
            One DataFrame with a row per player per game, a fresh 0..n-1
            index, and missing values replaced by 0.
        """
        rangers = ["8478550", "8476459", "8479323", "8476389", "8475184", "8480817", "8480078",
                   "8476624", "8481554", "8482109", "8476918", "8476885", "8479324", "8482073",
                   "8479328", "8480833", "8478104", "8477846", "8477380", "8477380", "8477433",
                   "8479333", "8479991"]
        # The roster lists "8477380" twice; dict.fromkeys drops the repeat while
        # keeping order, so that player's games are not duplicated in the result.
        players = list(dict.fromkeys(rangers))
        with concurrent.futures.ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
            return pd.concat(executor.map(self.get_stats, players)).reset_index(drop=True).fillna(0)

    @staticmethod
    def get_stats(player: str) -> pd.DataFrame:
        """Download and flatten the game log of a single player.

        Args:
            player: The player's ID as used in the API URL.

        Returns:
            A DataFrame with one row per game, tagged with a ``player_id``
            column and always containing a ``faceOffPct`` column.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        url = f"https://statsapi.web.nhl.com/api/v1/people/{player}/stats?stats=gameLog"
        with requests.Session() as request:
            response = request.get(url, timeout=30)
        # Raises on 4xx/5xx and is a no-op otherwise, so no status check or
        # print of its (None) return value is needed.
        response.raise_for_status()
        data = response.json()
        df = (
            pd.json_normalize(data=data, record_path=["stats", "splits"])
            .rename(columns={"team.id": "team_id", "team.name": "team_name",
                             "opponent.id": "opponent_id", "opponent.name": "opponent_name"})
            .assign(player_id=player)
        )
        # Drop link/gamePk columns, then keep only the last dotted component of
        # each remaining column name (e.g. "stat.goals" -> "goals").
        df = df[df.columns.drop(list(df.filter(regex="link|gamePk")))]
        df.columns = df.columns.str.split(".").str[-1]
        # Guarantee the column exists so frames for all players line up.
        if "faceOffPct" not in df.columns:
            df["faceOffPct"] = 0
        return df
if __name__ == "__main__":
stats = Scrape().main()
print(stats)

TypeError: list indices must be integers or slices, not Tag. One of my loops isn't working

The ultimate goal is to output selected data columns to a .csv file. I had it working once, but it only got the first table on the page and I need both. Now it raises the error in the title. I'm quite new to Python and I don't know how I got to this point in the first place. I need both the call table and the put table, but on the web page the calls come first, and when I used .find I only got the calls. I am working on this with a friend, and he wrote the last two functions. He could get the columns I wanted, but we still only got the calls. I tried to fix that, and now I get the error in the title.
import bs4
import requests
import pandas as pd
import csv
from bs4 import BeautifulSoup
def ticker():
    """Return the ticker symbols to scrape.

    The list is hard-coded for now; extend it to scrape more symbols.
    """
    # Returned directly instead of via a local named `ticker`, which would
    # shadow this function's own name inside its body.
    return ['GME', 'NYMT']
def ticker_site(tickers=None):
    """Create the list of Yahoo Finance options-page URLs for the scraper.

    Args:
        tickers: Ticker symbols to build URLs for. When omitted, the symbols
            returned by ``ticker()`` are used.

    Returns:
        One URL per symbol, in the same order as the symbols.
    """
    if tickers is None:
        tickers = ticker()
    return [
        f'https://finance.yahoo.com/quote/{symbol}/options?p={symbol}'
        for symbol in tickers
    ]
optionRows = []
for i in range(len(ticker_site())):
optionRows.append([])
# Download one options page and return it parsed by BeautifulSoup with the
# 'html.parser' backend.
# NOTE(review): `i` is neither a parameter nor assigned in this function, so it
# is looked up outside of it; a loop earlier in the script also uses `i`, so
# which URL is fetched depends on the value that loop left behind — confirm
# the intended index.
def ticker_gets():
option_page = ticker_site()
requested_page = requests.get(option_page[i])
ticker_soup = BeautifulSoup(requested_page.text,'html.parser')
return ticker_soup
# Collect the <td> texts from the first two tables of the fetched page into
# the optionRows list defined outside this function, then return that list.
def soup_search():
table = ticker_gets()
both_tables = table.find_all('table')
# The first table is treated as the calls, the second as the puts.
call_table = both_tables[0]
put_table= both_tables[1]
# NOTE(review): find() is used here rather than find_all(); per the
# BeautifulSoup docs it yields a single element, so the loops below would walk
# that one element's children instead of every row — confirm intent.
call_rows = call_table.find('tr')
put_rows = put_table.find('tr')
#makes the call table
for call in call_rows:
whole_call_table = call.find_all('td')
call_row = [y.text for y in whole_call_table]
# NOTE(review): `call` is the item being iterated, not an integer position,
# yet it is used as the index into optionRows; this matches the reported
# "list indices must be integers or slices, not Tag".
optionRows[call].append(call_row)
#makes the put table
for put in put_rows:
whole_put_table = put.find_all('td')
put_row = [z.text for z in whole_put_table]
# Same indexing pattern as in the call loop: `put` is the iterated item.
optionRows[put].append(put_row)
# Drop the first entry of every per-ticker list.
for i in range(len(optionRows)):
optionRows[i] = optionRows[i][1:len(optionRows[i])]
return optionRows
def getColumns(columnIndexes=(2, 4, 5), rowsByTicker=None):
    """Pick the requested cells out of every scraped option row.

    Args:
        columnIndexes: Positions of the cells to keep from each row, in
            output order.
        rowsByTicker: Rows grouped per ticker (a list of lists of cell
            lists). When omitted, the rows are obtained from soup_search().

    Returns:
        A list with one entry per ticker; each entry is a list of rows, and
        each row is a new list holding only the selected cells.
    """
    if rowsByTicker is None:
        # Scrape once and reuse the result: soup_search() fetches a page and
        # appends to a shared list, so calling it per ticker repeats both.
        rowsByTicker = soup_search()
    return [
        [[row[i] for i in columnIndexes] for row in tickerRows]
        for tickerRows in rowsByTicker
    ]
def csvOutputer():
    """Write the selected option columns of every ticker to the file 'newcsv'.

    The file starts with a header row, followed by one row per option, each
    prefixed with the ticker symbol it belongs to.
    """
    rows = getColumns()
    fields = ["Ticker", "Strike", "Bid", "Ask"]
    # newline='' lets the csv module control line endings itself; without it
    # extra blank lines can appear between rows on some platforms.
    with open('newcsv', 'w', newline='') as f:
        write = csv.writer(f)
        write.writerow(fields)
        # ticker() is evaluated once, and each output row is built fresh
        # instead of inserting the symbol into the scraped row in place.
        for symbol, symbolRows in zip(ticker(), rows):
            write.writerows([symbol, *row] for row in symbolRows)
csvOutputer()

python requests-html get id value of element

I've been playing with an example taken from here:
https://stackoverflow.com/a/61408325
It works and was very helpful, but I'm struggling with the requests-html documentation.
In this example is it possible to get the id value of the element?
from requests_html import AsyncHTMLSession
from collections import defaultdict
import pandas as pd
url = 'https://www.flashscore.com/football/england/premier-league-2018-2019/results/'
asession = AsyncHTMLSession()
async def get_scores():
r = await asession.get(url)
await r.html.arender()
return r
results = asession.run(get_scores)
results = results[0]
times = results.html.find("div.event__time")
home_teams = results.html.find("div.event__participant.event__participant--home")
scores = results.html.find("div.event__scores.fontBold")
away_teams = results.html.find("div.event__participant.event__participant--away")
event_part = results.html.find("div.event__part")
# Build one table row per match: for every index of `times`, take the text of
# the element at that index from each of the scraped element lists.
sources = {
    'times': times,
    'home_teams': home_teams,
    'scores': scores,
    'away_teams': away_teams,
    'event_part': event_part,
}
dict_res = defaultdict(list)
for ind in range(len(times)):
    for column, elements in sources.items():
        dict_res[column].append(elements[ind].text)
df_res = pd.DataFrame(dict_res)
I managed to get the id, although I don't know whether this is the most suitable way.
What I did was search for the div of the entire match:
match_div = results.html.find("div.event__match")
and then get the id from its attributes:
for ind in range(len(times)):
id = match_div[ind].attrs['id']
I think there must be a more direct way of doing this, but I'm not getting there.

Return DataFrame using ipywidgets Button

I'm currently creating a Class that inherits a DataFrame from pandas. I'm interested in developing a method called 'new_filter' that is a fancier execution of a DataFrame command:
import pandas as pd
from ipywidgets import widgets
from IPython.display import display
import numpy as np
class Result(pd.DataFrame):
    """DataFrame subclass that can filter itself through a small ipywidgets form."""

    @property
    def _constructor(self):
        # pandas reads this attribute to decide which class to build for
        # frames derived from this one; it must be a property, not a method.
        return Result

    def _filter_done(self, c):
        """Button callback: apply ``<column> <expression>`` as a row filter.

        Args:
            c: The argument the button passes to its click handler; unused.

        Returns:
            The rows of this frame for which the expression holds.
        """
        self._column_name = self._filter_dd.value
        self._expression = self._filter_txt.value
        # NOTE(review): eval() executes whatever is typed into the text box;
        # only acceptable for trusted, interactive use.
        return self[eval('self.'+ self._column_name +' '+self._expression)]

    def new_filter(self):
        """Display a column dropdown, an expression box and a 'Done' button.

        Clicking the button calls ``_filter_done``. Nothing is returned.
        """
        self._filter_dd = widgets.Dropdown(options=list(self.columns),
                                           description='Column:')
        self._filter_txt = widgets.Text(description='Expr:')
        self._filter_button = widgets.Button(description = 'Done')
        self._filter_box = widgets.VBox([self._filter_dd, self._filter_txt, self._filter_button])
        display(self._filter_box)
        self._filter_button.on_click(self._filter_done)
After creating an object like:
test = Result(np.random.randn(3,4), columns=['A','B','C','D']) #just an example
test_2 = test.new_filter()
Then, for example:
Widget Output
What I want is that 'test_2' be an object from 'Result' class. Is there any solution to this?
First, you will have to return something in the function new_filter. Second, if you want the same object to be modified, it is a bit hard I think. One thing you can do is to have an object which has a trait which can be updated in _filter_done.
Here is a small example of how you can do it:
import pandas as pd
from ipywidgets import widgets
from IPython.display import display
import numpy as np
class Result(pd.DataFrame):
    """DataFrame subclass whose filter form reports its result via a holder object."""

    @property
    def _constructor(self):
        # pandas reads this attribute to decide which class to build for
        # frames derived from this one; it must be a property, not a method.
        return Result

    def _filter_done(self, obj, c):
        """Button callback: filter this frame and store the result on ``obj``.

        Args:
            obj: The object to be modified; its ``data`` attribute is set to
                the filtered rows.
            c: The argument the button passes to its click handler; unused.
        """
        self._column_name = self._filter_dd.value
        self._expression = self._filter_txt.value
        # NOTE(review): eval() executes whatever is typed into the text box;
        # only acceptable for trusted, interactive use.
        obj.data = self[eval('self.'+ self._column_name +' '+self._expression)]

    def new_filter(self):
        """Display the filter form and return the object that will hold the result.

        Returns:
            A ``FilterResult`` whose ``data`` attribute is updated each time
            the 'Done' button is clicked.
        """
        self._filter_dd = widgets.Dropdown(options=list(self.columns),
                                           description='Column:')
        self._filter_txt = widgets.Text(description='Expr:')
        self._filter_button = widgets.Button(description = 'Done')
        self._filter_box = widgets.VBox([self._filter_dd, self._filter_txt, self._filter_button])
        display(self._filter_box)
        result_obj = FilterResult()
        # The lambda binds result_obj so the click handler knows where to
        # store the filtered frame.
        self._filter_button.on_click(lambda arg: self._filter_done(result_obj, arg))
        return result_obj
from traitlets import HasTraits
from traittypes import DataFrame
# Holder object returned by Result.new_filter; the button callback stores the
# filtered frame on its `data` attribute.
class FilterResult(HasTraits):
# `DataFrame` here is the trait type imported from traittypes.
data = DataFrame()
With the same example code as in your question, i.e.,
test = Result(np.random.randn(3,4), columns=['A', 'B', 'C','D']) #just an example
test_2 = test.new_filter()
You can see that whenever you click on done, the updated dataframe is in test_2.data.

How to solve "TypeError: expected string or buffer" when importing json data via api?

I'm trying to import JSON data via an API, and use the imported data to construct a DataFrame.
import json
import pandas as pd
import numpy as np
import requests
# Request the 2014 stats for one country and build a DataFrame from the
# payload's 'data' entry, indexed by its 'tag' column.
api_username = 'acb'
api_password = 'efg'
germany_name = 'Germany'
germany_api_url = "https://api.country_data.com/stats/?country=" + germany_name + "&year=2014"
germany_api_resp = requests.get(germany_api_url,auth=(api_username,api_password))
# NOTE(review): json.loads is given the value returned by requests.get
# directly, rather than a string taken from it.
germany_data_json = json.loads(germany_api_resp)
germany_frame = pd.DataFrame(germany_data_json['data']).set_index('tag')
print(germany_frame) shows me the desired DataFrame.
I want to repeat the process for many countries, not just 'Germany', so I created a country object like this:
# Wraps the download for one country: builds the URL from the name, performs
# the request, and turns the payload's 'data' entry into a DataFrame indexed
# by 'tag'. All of the work happens in the constructor.
class Country(object):
def __init__(self,name):
self.name = name
self.api_url = "https://api.country_data.com/stats/?country=" + name + "&year=2014"
# api_username and api_password are read from module scope.
self.api_resp = requests.get(self.api_url,auth=(api_username,api_password))
# NOTE(review): json.loads receives the object returned by requests.get here,
# not a string; this is the call the reported
# "TypeError: expected string or buffer" points at.
self.data_json = json.loads(self.api_resp)
self.frame = pd.DataFrame(self.data_json['data']).set_index('tag')
When I create my first object, like this:
Germany = Country('Germany')
I get an Error message:
TypeError: expected string or buffer
Can someone help me with this issue?
I don't know which version of Python or of requests you're using, but I recommend updating everything. Here is an error I found:
self.data_json = json.loads(self.api_resp)
You are passing the Response object returned by requests to json.loads, which expects a string, so change it to:
self.data_json = self.api_resp.json()
I replaced your API URL with another one because yours is wrong, and with that change it works for me.
See ya !

Categories