import pandas as pd
df = pd.read_csv("stocks.csv")
# Replace NaN with None so missing values behave like plain Python objects
df = df.where(pd.notnull(df), None)
df["date"] = pd.to_datetime(df["date"])
# NOTE: DataFrame.max(numeric_only=True) aggregates every numeric column,
# so m is a one-element Series indexed by column name ('price'), not a scalar.
m = df.loc[df['market'] == "NASDAQ"].max(numeric_only=True)
m
This returns the following data
price 99614.04
dtype: float64
Now, when I try to use the variable 'm', I receive the following error:
import pandas as pd
df = pd.read_csv("stocks.csv")
df = df.where(pd.notnull(df), None)
df["date"] = pd.to_datetime(df["date"])
# m is a one-element Series (index: 'price'), not a scalar float.
m = df.loc[df['market'] == "NASDAQ"].max(numeric_only=True)
# Comparing the 'price' column (integer row index) against the Series m
# (indexed by column name) is a Series-vs-Series comparison, which requires
# identical indexes -- hence "Can only compare identically-labeled Series".
df.loc[(df['market'] == "NASDAQ") & (df["price"] == m)]
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_62032/1757287628.py in <module>
5 df["date"] = pd.to_datetime(df["date"])
6 m = df.loc[df['market'] == "NASDAQ"].max(numeric_only=True)
----> 7 df.loc[(df['market'] == "NASDAQ") & (df["price"] == m)]
~\Anaconda3\lib\site-packages\pandas\core\ops\common.py in new_method(self, other)
67 other = item_from_zerodim(other)
68
---> 69 return method(self, other)
70
71 return new_method
~\Anaconda3\lib\site-packages\pandas\core\arraylike.py in __eq__(self, other)
30 #unpack_zerodim_and_defer("__eq__")
31 def __eq__(self, other):
---> 32 return self._cmp_method(other, operator.eq)
33
34 #unpack_zerodim_and_defer("__ne__")
~\Anaconda3\lib\site-packages\pandas\core\series.py in _cmp_method(self, other, op)
5494
5495 if isinstance(other, Series) and not self._indexed_same(other):
-> 5496 raise ValueError("Can only compare identically-labeled Series objects")
5497
5498 lvalues = self._values
ValueError: Can only compare identically-labeled Series objects
But when I use the actual value for 'm' it works.
import pandas as pd
df = pd.read_csv("stocks.csv")
df = df.where(pd.notnull(df), None)
df["date"] = pd.to_datetime(df["date"])
m = df.loc[df['market'] == "NASDAQ"].max(numeric_only=True)
# A plain float is broadcast element-wise against the column, so no index
# alignment is attempted and the lookup succeeds.
df.loc[(df['market'] == "NASDAQ") & (df["price"] == 99614.04)]
id name price symbol industry market currency date
25 1abf2ffc-3396-4ed9-954d-956be97668c0 Brocade Communications Systems, Inc. 99614.04 BRCD Computer Communications Equipment NASDAQ PLN 2020-09-12
Could someone please explain why this interaction is playing out this way?
Return value is a Series, you can use
m = df.loc[df['market'] == "NASDAQ", 'price'].max(numeric_only=True)
# or
m = df.loc[df['market'] == "NASDAQ"].max(numeric_only=True).item()
Use the following instead; currently you're returning a Series because you're not specifying which column you want to take the max from.
m = df[df['market'].eq("NASDAQ")]['price'].max()
Related
My project is relatively straight forward. I am attempting to create a web-scraping tool that retrieves a random event from any given wikipedia article for a given date. The format of the URL is: url = f"https://en.wikipedia.org/wiki/{month}_{day}" where the month is the full name of the month followed by the day.
What I'm trying to achieve:
What I'm trying to achieve specifically here is that if an invalid date such as June 31 or Feb 30 were input, then the function below stops and returns a Please provide a valid date as its output without an error message.
Attempted Solution:
I've tried this with an if statement mapping a set of months to a set of days, but it's pretty wonky, as shown below:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
# Target page, e.g. https://en.wikipedia.org/wiki/January_99 (deliberately invalid here)
month = 'January'
day = '99'
url = f"https://en.wikipedia.org/wiki/{month}_{day}"
# Valid day numbers, as strings, for 30-, 31- and 29-day months.
thirty = [*range(1,31)]
thirty = [str(x) for x in thirty]
thirty_one = [*range(1,32)]
thirty_one = [str(x) for x in thirty_one]
twenty_nine = [*range(1,30)]
twenty_nine = [str(x) for x in twenty_nine]
# Fetch and parse the page once at module level.
soup = BeautifulSoup(requests.get(url).content, "html.parser")
def wikiscraper():
    """Return one random historical event scraped from the Wikipedia page
    for the module-level ``month``/``day``, or print a validation message
    and return None when the day is impossible for that month.
    """
    # Membership tests (`in` / `not in`), not equality against the sets:
    # comparing a str to a set with == is always False, so the original
    # guard never fired and the scraper ran even for invalid dates.
    if (month in {'April', 'June', 'September', 'November'} and day not in set(thirty)) or \
       (month in {'January', 'March', 'May', 'July', 'August', 'October', 'December'} and day not in set(thirty_one)) or \
       (month == 'February' and day not in set(twenty_nine)):
        return print("Please provide a valid date")

    # Collect (year, description) pairs from the "Events" section only.
    events = []
    for li in soup.select("h3 + ul > li"):
        if (h2 := li.find_previous("h2")) and h2.find(id="Events"):
            date, event = li.text.replace("–", "-").split(" - ", maxsplit=1)
            events.append((date, event))

    # Build the frame once (the original constructed it twice).
    events = pd.DataFrame(events)
    events.columns = ['year', 'event']
    pd.options.display.max_colwidth = 300
    events['combined'] = 'On this date in the year' + ' ' + events.year + ' ' + events.event
    events = events[['combined']]
    return events.sample()
wikiscraper()
which returns
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-130-b89d9001cf84> in <module>
----> 1 wikiscraper()
<ipython-input-129-2c547a995093> in wikiscraper()
18 cols = ['year','event']
19 events = pd.DataFrame(events)
---> 20 events.columns = cols
21
22 pd.options.display.max_colwidth = 300
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in __setattr__(self, name, value)
5150 try:
5151 object.__getattribute__(self, name)
-> 5152 return object.__setattr__(self, name, value)
5153 except AttributeError:
5154 pass
pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__set__()
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in _set_axis(self, axis, labels)
562 def _set_axis(self, axis: int, labels: Index) -> None:
563 labels = ensure_index(labels)
--> 564 self._mgr.set_axis(axis, labels)
565 self._clear_item_cache()
566
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in set_axis(self, axis, new_labels)
224
225 if new_len != old_len:
--> 226 raise ValueError(
227 f"Length mismatch: Expected axis has {old_len} elements, new "
228 f"values have {new_len} elements"
ValueError: Length mismatch: Expected axis has 0 elements, new values have 2 elements
I also tried it with assert, which works, but I want to keep it clean — without an AssertionError — and just have a printed output requesting a valid date. I'm sure the if statement isn't a very "pythonic" way of doing it either, although getting it to run with the desired output is the bigger priority.
My ultimate goal is simply to get the function to stop and print "Please provide a valid date" if the string input is not compatible with a real date.
edit. The solution was simple
if (month in set(['April','June','September','November']) and day not in set(thirty))| \
(month in set(['January','March','May','July','August','October','December']) and day not in set(thirty_one))| \
(month in set(['February']) and day not in set(twenty_nine)):
return print("Please provide a valid date")
Just use `in` and `not in` instead of `==` and `!=`. User Timus to the rescue on a dumb mistake.
I am trying to get EIA data using its API; however, I encountered JSON errors when it calls the series. I recall it was working fine about 6 months ago — I'm not sure if something changed in EIA's API. Could anyone shed some light on how to fix this?
Here's the code:
import pandas as pd
import eia
def retrieve_data():
    """Pull the STEO OECD petroleum-stocks series from the EIA API and
    return it as a DataFrame with a DatetimeIndex named 'Date' and a
    single 'Price' column.

    NOTE(review): the ``eia`` package wraps EIA's APIv1, which was
    retired -- ``search.json()`` no longer contains a 'series' key, which
    is exactly the KeyError shown below. Migrating to APIv2 routes is the
    real fix; this function only repairs its local defects. TODO confirm
    against the current EIA API docs.
    """
    # Create EIA API using your specific API key
    api_key = "YOUR_API_KEY"
    api = eia.API(api_key)

    # Retrieve data by series ID
    series_ID = 'STEO.PASC_OECD_T3.M'
    series_search = api.data_by_series(series=series_ID)

    df = pd.DataFrame(series_search)
    df.index.names = ['Date']
    df.columns = ["Price"]
    # Rewrite "YYYY MMDD hh"-style index labels into ISO "YYYY-MM-DD".
    # (The original literal was split across two physical lines, which is a
    # SyntaxError; it is rejoined here as one raw string.)
    df.index = df.index.str.replace(
        r'^([\d]{4})\s([\d]{2})([\d]{2})\s[\d]{2}', r'\1-\2-\3', regex=True)
    df.index = pd.to_datetime(df.index)
    return df
data = retrieve_data()
print(data)
and the error message is as the following:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/tmp/ipykernel_942387/4124051913.py in <module>
17 return df
18
---> 19 data = retrieve_data()
20 print(data)
21 #data.to_csv('OK_WTI_Spot_Price_FOB.csv',index=True)
/tmp/ipykernel_942387/4124051913.py in retrieve_data()
9 # Retrieve Data By Series ID
10 series_ID='STEO.PASC_OECD_T3.M'
---> 11 series_search = api.data_by_series(series=series_ID)
12 df = pd.DataFrame(series_search)
13 df.index.names = ['Date']
~/miniconda3/lib/python3.7/site-packages/eia/api.py in data_by_series(self, series)
422 else:
423 lst_dates = [x[0][0:4] + " " + x[0][4:] + " " + x[0][6:8]
--> 424 for x in search.json()['series'][0]['data']]
425 lst_values = [x[1] for x in
426 search.json()['series'][0]['data']]
KeyError: 'series'
Below is my sample dataframe. I would like to use networkdays to calculate the number of working days between the date columns in my df, but I ran into the issue below — can someone help with this?
#import lib
import pandas as pd
from workdays import workday, networkdays
#sample dict to create df
dict1 = {
    'startdate' : ['2022-01-17','2022-02-28'],
    'enddate' : ['2022-01-17','2022-03-15']
}
# NOTE(review): df is never created from dict1 -- a
# `df = pd.DataFrame(dict1)` line is missing before these conversions.
#convert to datetime format
df['startdate'] = df['startdate'].astype('datetime64')
df['enddate'] = df['enddate'].astype('datetime64')
#create new column count and apply function
# NOTE(review): this CALLS networkdays on whole Series objects and passes
# its *result* to apply; apply needs a callable plus axis=1 so each row's
# scalar dates reach networkdays. The Series subtraction inside networkdays
# is what raises "'Series' object has no attribute 'days'".
df['count']=df.apply(networkdays(df['startdate'],df['enddate']))
#getting error :
AttributeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_20080/333906513.py in <module>
----> 1 df['count']=df.apply(networkdays(df['startdate'],df['enddate']))
C:\ProgramData\Anaconda3\lib\site-packages\workdays.py in networkdays(start_date, end_date, holidays)
10
11 def networkdays(start_date, end_date, holidays=[]):
---> 12 delta_days = (end_date - start_date).days + 1
13 full_weeks, extra_days = divmod(delta_days, 7)
14 # num_workdays = how many days/week you work * total # of weeks
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
5485 ):
5486 return self[name]
-> 5487 return object.__getattribute__(self, name)
5488
5489 def __setattr__(self, name: str, value) -> None:
AttributeError: 'Series' object has no attribute 'days'
df['startdate'] is a series object, it contains more than one piece of data.
>> df['enddate'] # <class 'pandas.core.series.Series'>
0 2022-01-17
1 2022-03-15
You should pass a lambda to the df.apply function, if you have to apply a function to all the rows in the DataFrame. The parameter of the lambda is the row of data.
# Build the frame from the dict, then convert both date columns in one pass.
df = pd.DataFrame(dict1)
for col in ('startdate', 'enddate'):
    df[col] = df[col].astype('datetime64')
# Row-wise apply: each row hands its own pair of scalar dates to networkdays.
df['count'] = df.apply(lambda row: networkdays(row['startdate'], row['enddate']), axis=1)
print(df)
#    startdate    enddate  count
# 0 2022-01-17 2022-01-17      1
# 1 2022-02-28 2022-03-15     12
I got 10 csvfiles like this :
I want to add 10 columns to my dataframe with a VWAP calculation. I tried to create the columns and then concatenate them into the dataframe, but it doesn't work at all. I've tried a lot of things; the main problem is that I can't create new columns from calculated rows:
import pandas as pd
import os
import glob
from IPython.display import display, HTML
import csv
# use glob to get all the csv files
# in the folder
path = os.getcwd()
csv_files = glob.glob(os.path.join("*.csv"))
"""
#To change the name of every columns
liste1 = []
header_list = []
for f in csv_files:
liste1.append(f)
header_list = [a.strip(".csv") for a in liste1]
"""
def add(f):
    """Read one CSV, collapse it to hourly means, and append a running
    VWAP column (cumulative price*amount over cumulative amount)."""
    hourly = (
        pd.read_csv(f, header=0)
        .assign(timestamp=lambda d: pd.to_datetime(d["timestamp"]))
        .groupby(pd.Grouper(key="timestamp", freq="h"))
        .agg("mean")
        .reset_index()
    )
    turnover = (hourly["price"] * hourly["amount"]).cumsum()
    hourly["vwap"] = turnover / hourly["amount"].cumsum()
    return hourly
# NOTE(review): this loop is what raises the TypeError in the traceback --
# pd.concat() expects an *iterable of* pandas objects, but add(f) returns a
# single DataFrame. Collect the per-file results in a list and concat once
# after the loop instead.
for f in csv_files:
    # Re-reads f here even though add(f) reads it again itself.
    df = pd.read_csv(f, header=0)
    df2 = pd.concat(add(f))
    # mode="a" appends a fresh header block per file into one CSV.
    df2.to_csv(r"C:\Users\vion1\Ele\Engie\Sorbonne\resultat\resultat_projet_4.csv", encoding='utf-8', index=False, mode = "a")
Thanks for your help
The traceback :
TypeError
Traceback (most recent call last) ~\AppData\Local\Temp/ipykernel_16732/557098648.py in <module>
31 for f in csv_files:
32 df = pd.read_csv(f, header=0)
---> 33 df2 = pd.concat(add(f))
34 df2.to_csv(r"C:\Users\vion1\Ele\Engie\Sorbonne\resultat\resultat_projet_4.csv", encoding='utf-8', index=False, mode = "a")
35
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\reshape\concat.py in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
292 ValueError: Indexes have overlapping values: ['a']
293 """
--> 294 op = _Concatenator(
295 objs,
296 axis=axis,
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\reshape\concat.py in __init__(self, objs, axis, join, keys, levels, names, ignore_index, verify_integrity, copy, sort)
327 ):
328 if isinstance(objs, (ABCSeries, ABCDataFrame, str)):
--> 329 raise TypeError(
330 "first argument must be an iterable of pandas "
331 f'objects, you passed an object of type "{type(objs).__name__}"'
TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"
If you need only the aggregated values in the output:
def add(df):
    """Return the running VWAP of *df* after resampling to hourly means.

    Parses df['timestamp'] in place, groups by calendar hour, and returns
    cumulative (price * amount) divided by cumulative amount as a Series.
    """
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    hourly = (
        df.groupby(pd.Grouper(key="timestamp", freq="h"))
        .agg("mean")
        .reset_index()
    )
    turnover = (hourly["price"] * hourly["amount"]).cumsum()
    return turnover / hourly["amount"].cumsum()
out = []
for f in csv_files:
df = pd.read_csv(f, header=0)
#added aggregate DataFrame with new column to list of DataFrames
out.append(add(df))
#joined all dfs together
df2 = pd.concat(out, ignore_index=True, axis=1)
#removed append mode
df2.to_csv(r"C:\Users\vion1\Ele\Engie\Sorbonne\resultat\resultat_projet_4.csv",
encoding='utf-8')
I am using this function to pull data from the Cryptocompare website into a pandas dataframe:
def daily_price_historical(symbol, comparison_symbol='USD', limit=1, aggregate=1, exchange='', allData='true'):
    """Fetch daily OHLCV history for *symbol* from CryptoCompare.

    Returns a DataFrame indexed by timestamp with the API's columns plus
    'symbol' and '1dret' (one-day % return). When the API reports an error
    for the symbol (e.g. an unknown ticker returns {"Response": "Error"}
    with no 'Data'), an empty DataFrame is returned instead of crashing,
    so callers can safely loop over many symbols.
    """
    url = 'https://min-api.cryptocompare.com/data/histoday?fsym={}&tsym={}&limit={}&aggregate={}&allData={}'\
            .format(symbol.upper(), comparison_symbol.upper(), limit, aggregate, allData)
    if exchange:
        url += '&e={}'.format(exchange)
    page = requests.get(url)
    # Guard against error responses: previously page.json()['Data'] on an
    # error payload produced a frame without a 'time' column and the
    # 'DataFrame has no attribute time' AttributeError seen in the traceback.
    data = page.json().get('Data')
    if not data:
        return pd.DataFrame()
    df = pd.DataFrame(data)
    df['timestamp'] = [datetime.datetime.fromtimestamp(d) for d in df.time]
    df.set_index('timestamp', inplace=True)
    df['symbol'] = symbol
    df['1dret'] = 100 * df['close'].pct_change()
    return df
This works fine for most symbols I pass in, but when I loop over a longer list of symbols I get the error: AttributeError: 'DataFrame' object has no attribute 'time'
I assume this is due to the API returning an error for certain symbols, e.g.:
https://min-api.cryptocompare.com/data/histoday?fsym=FAKE&tsym=USD
returns "Response":"Error" with no further data
I'm afraid I'm not very experienced with url requests/APIs. Is there code I can add to the function to skip the symbols that are causing the issue?
Thanks for your help!
Additional information:
Code used to loop over coins (which is a list of 130 symbols):
# Pre-declare the column layout so per-coin frames line up when combined.
price_columns = ['close', 'high', 'low', 'open', 'time',
'volumefrom','volumeto', 'symbol', '1dret']
top_coin_prices = pd.DataFrame(columns=price_columns)
for coin in coins:
    output = daily_price_historical(coin)
    # NOTE(review): DataFrame.append was deprecated in pandas 1.4 and
    # removed in 2.0 -- collect the frames in a list and pd.concat once
    # after the loop on modern pandas. TODO confirm target pandas version.
    top_coin_prices = top_coin_prices.append(output)
Full Traceback:
AttributeError Traceback (most recent call last)
<ipython-input-277-126f5d1686b2> in <module>()
8 # populate df with data for all coins
9 for coin in coins:
---> 10 output = daily_price_historical(coin)
11 top_coin_prices = top_coin_prices.append(output)
12
<ipython-input-111-65b3fa76b4ab> in daily_price_historical(symbol, comparison_symbol, limit, aggregate, exchange, allData)
7 data = page.json()['Data']
8 df = pd.DataFrame(data)
----> 9 df['timestamp'] = [datetime.datetime.fromtimestamp(d) for d in df.time]
10 df.set_index('timestamp', inplace=True)
11 df['symbol'] = symbol
/anaconda/lib/python3.6/site-packages/pandas/core/generic.py in __getattr__(self, name)
2968 if name in self._info_axis:
2969 return self[name]
-> 2970 return object.__getattribute__(self, name)
2971
2972 def __setattr__(self, name, value):
AttributeError: 'DataFrame' object has no attribute 'time'