Q : Python CSV - Key Error when plotting - python

I'm trying to plot a dataframe, but I'm encountering an error that I don't know how to solve.
Python Code:
import numpy as np
from datetime import date,time,datetime
import pandas as pd
import csv
df = pd.read_csv('MainD2.csv', parse_dates=['Time_Stamp'], infer_datetime_format=True)
df["Time_Stamp"] = pd.to_datetime(df["Time_Stamp"]) # convert to Datetime
df_filter = df[df["Curr"].le(3.0)] # new df with less or equal to 0.5
#print(df_filter)
where = (df_filter[df_filter["Time_Stamp"].diff().dt.total_seconds() > 1] ["Time_Stamp"] - pd.Timedelta("1s")).astype(str).tolist() # Find where diff > 1 second
df_filter2 = df[df["Time_Stamp"].isin(where)] # Create new df with those
#print(df_filter2)
df_filter2["AC_Input_Current"] = 0.0 # Set c1 to 0.0
#df_filter2
df = df.set_index("Time_Stamp")
df_filter2 = df_filter2.set_index("Time_Stamp")
df.loc[df_filter2.index] = df_filter2
def getMask(start,end):
    """Return a boolean mask for rows of ``df`` with start < Time_Stamp <= end.

    Reads the module-level ``df`` and compares its 'Time_Stamp' column against
    the two bounds; the lower bound is exclusive and the upper bound inclusive.
    """
    # NOTE(review): the surrounding script calls df.set_index("Time_Stamp")
    # before this mask is built, which moves 'Time_Stamp' out of the columns --
    # likely the source of the reported KeyError; confirm.
    mask = (df['Time_Stamp'] > start) & (df['Time_Stamp'] <= end)
    return mask
start = '2017-06-26 01:05:00'
end = '2017-06-26 01:20:00'
timerange = df.loc[getMask(start, end)]
timerange.plot(x='Time_Stamp', y='AC_Input_Current', style='-', color='black')*
*------------------ Plotting Part -------------------
timerange.plot(x='Time_Stamp', y='AC_Input_Current', style='-', color='black')
I have encountered this error when trying to plot :
KeyError: 'Time_Stamp'

Related

python warning: Boolean Series key will be reindexed to match DataFrame index

The code below reports a warning:
UserWarning: Boolean Series key will be reindexed to match DataFrame index.
ds = ds[(df[['ts']].diff() > threshold).any(axis=1)]
2019-08-31 08:18:57.731 python[58541:1317145] [QL] Can't get plugin bundle info at file:///Users/e12714/Library/QuickLook/NSQuickLookPlugin.qlgenerator
Code:
#!/usr/bin/env python
import pandas as pd
import numpy as np
from datetime import datetime,timedelta
import matplotlib.pyplot as plt
from collections import OrderedDict
m = OrderedDict()
m["08-30 22:30:10.063"] = 5
m["08-30 22:30:15.023"] = 5
m["08-30 22:30:20.043"] = 5
m["08-30 22:30:25.015"] = 2
m["08-30 22:30:25.020"] = 2
m["08-30 22:30:26.025"] = 2
m["08-30 22:30:40.032"] = 5
m["08-30 22:30:45.045"] = 5
m["08-30 22:30:50.022"] = 5
df = pd.DataFrame(list(m.items()), columns = ['ts', 'value'])
df['ts'] = [datetime.strptime(x,'%m-%d %H:%M:%S.%f') for x in df['ts']]
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(12,6))
ax.plot(df['ts'],df['value'],"--.")
dl = df[(df[['value']].shift() != df[['value']]).any(axis=1)]
dr = df[(df[['value']].shift(-1) != df[['value']]).any(axis=1)]
ds = pd.concat([dl,dr],ignore_index=True)
ds = ds.sort_values(['ts'])
threshold = timedelta(seconds=2)
ds = ds[(df[['ts']].diff() > threshold).any(axis=1)]
fig.autofmt_xdate()
ax.xaxis.set_ticks(np.array(ds['ts']))
ax.yaxis.grid(True)
plt.show()
The output looks good:
How to fix this warning?
Change this line of code:
ds = ds[(df[['ts']].diff() > threshold).any(axis=1)]
to
ds = ds[(ds[['ts']].diff() > threshold).any(axis=1)]

I do not know why the graph is not compiling and executing code for long time

I believe there is no error in the code,
but when I run the program, the graph is not shown in the plot. It just says "executing code",
and I've waited about an hour but it still doesn't show anything.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
excel_df = pd.read_csv('data.csv', header=None)
bool_idx = excel_df < 0.006
valid_data = excel_df[bool_idx]
true_data = valid_data.dropna()
tt = np.array(true_data.iloc[0:-1, 0])
print(tt)
tt2 = np.array(true_data.iloc[1:, 0])
print(tt2)
ts = abs(tt - tt2)
print(ts)
ind = np.array(np.where([ts < 0.001]))
graph1 = plt.plot(ind)
print(ind)
true_data0001 = true_data.iloc[0, ind]
print(true_data0001)
no error

How to apply euclidean distance to dataframe. Calculate each row

Please help me with this problem. I have been stuck on it for about two weeks and still haven't figured it out.
I want to use "apply" on a dataframe that I got from the Alpha Vantage API.
I want to apply euclidean distance to each row of dataframe.
import math
import numpy as np
import pandas as pd
from scipy.spatial import distance
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.neighbors import KNeighborsRegressor
from alpha_vantage.timeseries import TimeSeries
from services.KEY import getApiKey
ts = TimeSeries(key=getApiKey(), output_format='pandas')
And in my picture I got this
My chart (sorry can't post image because of my reputation)
In my code
stock, meta_data = ts.get_daily_adjusted(symbol, outputsize='full')
stock = stock.sort_values('date')
open = stock['1. open'].values
low = stock['3. low'].values
high = stock['2. high'].values
close = stock['4. close'].values
sorted_date = stock.index.get_level_values(level='date')
stock_numpy_format = np.stack((sorted_date, open, low
,high, close), axis=1)
df = pd.DataFrame(stock_numpy_format, columns=['date', 'open', 'low', 'high', 'close'])
df = df[df['open']>0]
df = df[(df['date'] >= "2016-01-01") & (df['date'] <= "2018-12-31")]
df = df.reset_index(drop=True)
df['close_next'] = df['close'].shift(-1)
df['daily_return'] = df['close'].pct_change(1)
df['daily_return'].fillna(0, inplace=True)
stock_numeric_close_dailyreturn = df['close', 'daily_return']
stock_normalized = (stock_numeric_close_dailyreturn - stock_numeric_close_dailyreturn.mean()) / stock_numeric_close_dailyreturn.std()
euclidean_distances = stock_normalized.apply(lambda row: distance.euclidean(row, date_normalized) , axis=1)
distance_frame = pd.DataFrame(data={"dist": euclidean_distances, "idx":euclidean_distances.index})
distance_frame.sort_values("dist", inplace=True)
second_smallest = distance_frame.iloc[1]["idx"]
most_similar_to_date = df.loc[int(second_smallest)]["date"]
And I want my chart to look like this
The chart that I want
And the code from this picture
distance_columns = ['Close', 'DailyReturn']
stock_numeric = stock[distance_columns]
stock_normalized = (stock_numeric - stock_numeric.mean()) / stock_numeric.std()
stock_normalized.fillna(0, inplace = True)
date_normalized = stock_normalized[stock["Date"] == "2016-06-29"]
euclidean_distances = stock_normalized.apply(lambda row: distance.euclidean(row, date_normalized), axis = 1)
distance_frame = pandas.DataFrame(data = {"dist": euclidean_distances, "idx": euclidean_distances.index})
distance_frame.sort_values("dist", inplace=True)
second_smallest = distance_frame.iloc[1]["idx"]
most_similar_to_date = stock.loc[int(second_smallest)]["Date"]
From what I could figure out, "apply" in df.apply behaves differently on a dataframe obtained in the pandas output format than on one obtained from pandas.csv_reader.
Is there an alternative that gives the same output for both formats (pandas and CSV)?
Thank you!
NB: sorry if my English is bad.

index out of bounds// Python, dataframe, plot

I want to plot the point with max value from dataframe.
import pandas as pd
import matplotlib.pyplot as plt
dane = pd.read_table('C:\\xxx.txt', names=('rok', 'kroliki', 'lisy', 'marchewki'))
df = pd.DataFrame(dane)
data = df[1:]
data=data.astype(float)
x = int(data['kroliki'].max())
y = int(data['lisy'].max())
z = int(data['marchewki'].max())
p= data['rok'].where(data['kroliki'] == x)
q = data['rok'].where(data['lisy'] == y)
r = data['rok'].where(data['marchewki'] == z)
p1 = int(p[p.notnull()])
q1 = int(q[q.notnull()])
r1 = int(r[r.notnull()])
point = pd.DataFrame({'x':[p1],'y':[q1],'z':[r1]})
point.plot((p1,x),(q1,y),(r1,z))
I get the following error:
IndexError: index 1993 is out of bounds for axis 0 with size 4
Does anybody know what is wrong with this code?
Thanks
I think that when you use Pandas to plot, it will look for indices within itself and not for values.
So, in your case, when you do:
point.plot(p1,x)
Pandas will look for the index 1993 in the x-direction, i.e., across all columns. In other words, you would need to have 1993 columns.
I tried to reproduce your problem as follows:
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randint(0,100,size=(10, 4)), columns=('rok', 'kroliki', 'lisy', 'marchewki'))
data = df[1:]
data=data.astype(float)
x = int(data['kroliki'].max())
y = int(data['lisy'].max())
z = int(data['marchewki'].max())
p = data['rok'].where(data['kroliki'] == x)
q = data['rok'].where(data['lisy'] == y)
r = data['rok'].where(data['marchewki'] == z)
p1 = int(p[p.notnull()])
q1 = int(q[q.notnull()])
r1 = int(r[r.notnull()])
point = pd.DataFrame({'x':[p1],'y':[q1],'z':[r1]})
point.plot((p1,x),(q1,y),(r1,z))
I get the following error:
>>> AttributeError: 'tuple' object has no attribute 'lower'
And when I run each point separately:
>>>> IndexError: index 85 is out of bounds for axis 0 with size 3
To solve it:
import matplotlib.pyplot as plt
plt.plot((point.x, point.y, point.z), (x,y,z),'ko')
And I got the following result:
Hope it helps.

Pandas Time Series and groupby

[Edited to more clearly state root problem, which behaves differently if you use numpy 1.8 as dmvianna points out]
I have a DataFrame that has time stamps and other data. In the end I would like to not use a formatted time as the index because it messes with matplotlib's 3D plotting. I also want to perform a groupby to populate some flag fields. This is causing me to run into a number of weird errors. The first two examples work as I would expect. Once I bring pd.to_datetime into the picture, it starts throwing errors.
runs as expected:
import pandas as pd
import numpy as np
df = pd.DataFrame({'time':np.random.randint(100000, size=1000),
'type':np.random.randint(10, size=1000),
'value':np.random.rand(1000)})
df['high'] = 0
def high_low(group):
    """Flag a group as high when its mean ``value`` exceeds 0.5.

    Sets the group's ``high`` column to 1 for every row when the mean of
    ``group.value`` is greater than 0.5; otherwise leaves it untouched.
    Returns the (possibly modified) group.
    """
    if group.value.mean() > .5:
        group.high = 1
    return group
grouped = df.groupby('type')
df = grouped.apply(high_low)
works fine:
df = pd.DataFrame({'time':np.random.randint(100000, size=1000),
'type':np.random.randint(10, size=1000),
'value':np.random.rand(1000)})
df.time = pd.to_datetime(df.time, unit='s')
df['high'] = 0
def high_low(group):
    """Flag a group as high when its mean ``value`` exceeds 0.5.

    Sets the group's ``high`` column to 1 for every row when the mean of
    ``group.value`` is greater than 0.5; otherwise leaves it untouched.
    Returns the (possibly modified) group.
    """
    if group.value.mean() > .5:
        group.high = 1
    return group
grouped = df.groupby('type')
df = grouped.apply(high_low)
throws error:
ValueError: Shape of passed values is (3, 1016), indices imply (3, 1000)
df = pd.DataFrame({'time':np.random.randint(100000, size=1000),
'type':np.random.randint(10, size=1000),
'value':np.random.rand(1000)})
df.time = pd.to_datetime(df.time, unit='s')
df = df.set_index('time')
df['high'] = 0
def high_low(group):
    """Flag a group as high when its mean ``value`` exceeds 0.5.

    Sets the group's ``high`` column to 1 for every row when the mean of
    ``group.value`` is greater than 0.5; otherwise leaves it untouched.
    Returns the (possibly modified) group.
    """
    if group.value.mean() > .5:
        group.high = 1
    return group
grouped = df.groupby('type')
df = grouped.apply(high_low)
throws error:
ValueError: Shape of passed values is (3, 1016), indices imply (3, 1000)
df = pd.DataFrame({'time':np.random.randint(100000, size=1000),
'type':np.random.randint(10, size=1000),
'value':np.random.rand(1000)})
df['epoch'] = df.time
df.time = pd.to_datetime(df.time, unit='s')
df = df.set_index('time')
df = df.set_index('epoch')
df['high'] = 0
def high_low(group):
    """Flag a group as high when its mean ``value`` exceeds 0.5.

    Sets the group's ``high`` column to 1 for every row when the mean of
    ``group.value`` is greater than 0.5; otherwise leaves it untouched.
    Returns the (possibly modified) group.
    """
    if group.value.mean() > .5:
        group.high = 1
    return group
grouped = df.groupby('type')
df = grouped.apply(high_low)
Anyone know what I'm missing / doing wrong?
Instead of using pd.to_datetime, I would use np.datetime64. It will work in columns and offers the same functionality as you expect from a datetime.index (np.datetime64 is a building block for datetime.index).
import numpy as np
data['time2'] = np.datetime64(data.time, 's')
Check the Docs
This would also lead to the same result:
import pandas as pd
data['time2'] = pd.to_datetime(data.time, unit='s')
Notice though that I'm using pandas 0.12.0 and Numpy 1.8.0. Numpy 1.7 has issues referred to in the comments below.

Categories