Candlestick Plot from a Pandas DataFrame retry - python

I am new to matplotlib and need some guidance. I have being trying to reproduce this code from "Candlestick Plot from a Pandas DataFrame" as a way to learn by adding a "read_csv" function.
my error message keep saying "valueError: Length mismatch: Expected axis has 6 elements, new values have 5 elements"
my questions is:
what am I missing in the code? I read in the cvs, I use the right columns of data, and I understand there is a reset of index, but I don't know why it keeps getting an error.
please help.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from mpl_finance import candlestick_ohlc
import matplotlib.dates as mdates
import datetime as dt
df = pd.read_csv("/Users/paul/Documents/python (original)/Quant/sp500.csv", usecols=['Date', 'Open','High','Low','Close'])
#Reset the index to remove Date column from index
df_ohlc = df.reset_index()
#Naming columns
df_ohlc.columns = ["Date","Open","High",'Low',"Close"]
#Converting dates column to float values
df_ohlc['Date'] = df_ohlc['Date'].map(mdates.date2num)
#Making plot
fig = plt.figure()
ax1 = plt.subplot2grid((6,1), (0,0), rowspan=6, colspan=1)
#Converts raw mdate numbers to dates
ax1.xaxis_date()
plt.xlabel("Date")
print(df_ohlc)
#Making candlestick plot
candlestick_ohlc(ax1,df_ohlc.values,width=1, colorup='g', colordown='k',alpha=0.75)
plt.ylabel("Price")
plt.legend()
plt.show()

You don't need "0", "1", "2" before each line in your .csv file. You must first remove that and then:
If you're going to reset the index, you need an actual index column in your dataframe, so add index_col like this:
df = pd.read_csv("/Users/paul/Documents/python (original)/Quant/sp500.csv", usecols=['Date', 'Open','High','Low','Close'], index_col= 'Date')
Convert your date column from string to datetime :
df_ohlc['Date'] = pd.to_datetime(df_ohlc['Date'])
EDIT:
If you can't delete the 0, 1, 2... column in your csv file because it's too large, modify the first row to make the 'index' column appear like this:
'index', 'Date', 'Open','High','Low','Close'
Then, in your code:
df = pd.read_csv("/Users/paul/Documents/python (original)/Quant/sp500.csv", usecols=['index', 'Date', 'Open','High','Low','Close'], index_col="Date")
df.drop('index', axis=1, inplace=True)

Related

How to create a Boxplot with Timestamp using Matplotlib and Seaborn?

I have been trying to get a boxplot with each box representing an emotion over a period of time.
The data frame used to plot this contains timestamp and emotion name. I have tried converting the timestamp into a string first and then to datetime and finally to int64. This resulted in the gaps between x labels as seen in the plot. I have tried the same without converting to int64, but the matplotlib doesn't seem to allow the dates in the plot.
I'm attaching the code I have used here:
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.style.use('classic')
%matplotlib qt
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
data = pd.read_csv("TX-governor-sentiment.csv")
## check data types
data.dtypes
# drop rows with all missing values
data = data.dropna(how='all')
## transforming the timestamp column
#convert from obj type to string then to date type
data['timestamp2'] = data['timestamp']
data['timestamp2'] = pd.to_datetime(data['timestamp2'].astype(str), format='%m/%d/%Y %H:%M')
# convert to number format with the following logic:
# yyyymmddhourmin --> this allows us to treat dates as a continuous variable
data['timestamp2'] = data['timestamp2'].dt.strftime('%Y%m%d%H%M')
data['timestamp2'] = data['timestamp2'].astype('int64')
print (data[['timestamp','timestamp2']])
#data transformation for data from Orange
df = pd.DataFrame(columns=('timestamp', 'emotion'))
for index, row in data.iterrows():
if row['sentiment'] == 0:
df.loc[index] = [row['timestamp2'], 'Neutral']
else:
df.loc[index] = [row['timestamp2'], row['Emotion']]
# Plot using Seaborn & Matplotlib
#convert timestamp in case it's not in number format
df['timestamp'] = df['timestamp'].astype('int64')
fig = plt.figure(figsize=(10,10))
#colors = {"Neutral": "grey", "Joy": "pink", "Surprise":"blue"}
#visualize as boxplot
plot_ = sns.boxplot(x="timestamp", y="emotion", data=df, width=0.5,whis=np.inf);
#add data point on top
plot_ = sns.stripplot(x="timestamp", y="emotion", data=df, alpha=0.8, color="black");
fig.canvas.draw()
#modify ticks and labels
plt.xlim([202003010000,202004120000])
plt.xticks([202003010000, 202003150000, 202003290000, 202004120000], ['2020/03/01', '2020/03/15', '2020/03/29', '2020/04/12'])
#add colors
for patch in plot_.artists:
r, g, b, a = patch.get_facecolor()
patch.set_facecolor((r, g, b, .3))
Please let me know how I can overcome this problem of gaps in the boxplot. Thank you!

Plot using Pandas columns

Can someone help me with my problem because I am newby to pandas and I have been confused.
Initially I made some subset selections and everything OK with my new dataframe(which is type pandas.core.frame.DataFrame). My new dataframe has two columns (date, count) and I want to plot a line plot having the date at the x axis and the count on y axis.
Suppose the name of the data frame is df and the names of the columns are date and count according to pandas documentation the command is:
ts = pd.Series(df['count'], index = df['date'])
ts.plot()
where is the wrong?
any help
It's best to refer Pandas website for first hand information. However, you can try the below code out-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # For show command
# Creating a dummy dataframe (You can also go ahead with Series)
df = pd.DataFrame([45, 20], columns=['count'], index=['12/11/2018', '10/1/2018'])
# Converting string to datetime format
df.index = pd.to_datetime(df.index, format='%d/%m/%Y')
df.index
# DatetimeIndex(['2018-11-12', '2018-01-10'], dtype='datetime64[ns]', freq=None)
df.plot()
plt.show()

Plot pandas dataframe index formatted as Month-Year on x axis

I have a dataframe that I want the x axis to show as APR-2018 for example. The ax.format_xdata line does not do the trick.
import datetime as dt
import pandas as pd
import time
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
data = {("IVOG",1493510400000):{"Adj_Close":119.2136,"MA(3)":119.2136,"EWMA(3)":119.2136},
("IVOG",1496188800000):{"Adj_Close":120.8236,"MA(3)":120.0186,"EWMA(3)":120.0454},
("IVOG",1498780800000):{"Adj_Close":120.2736,"MA(3)":120.1036,"EWMA(3)":120.1266},
("IVOG",1501459200000):{"Adj_Close":121.7836,"MA(3)":120.5236,"EWMA(3)":120.5832},
("IVOG",1504137600000):{"Adj_Close":120.3536,"MA(3)":120.4896,"EWMA(3)":120.5309},
("IVOG",1506729600000):{"Adj_Close":124.3336,"MA(3)":121.1303,"EWMA(3)":121.2749}}
df=pd.DataFrame.from_dict(data, orient = 'index')
print(df)
ax = plt.gca() # get current axis
df.plot(kind='line',y='Adj_Close', ax=ax)
df.plot(kind='line',y='MA(3)',ax=ax)
df.plot(kind='line',y='EWMA(3)', color='green', ax=ax)
print(df.index[0][1])
ax.format_xdata = mdates.DateFormatter('%b-%Y') # Trying to get APR-2018
plt.xlabel(df.index[0][0]) # Trying to Get the Ticker
_=plt.grid()
_=plt.xticks(rotation=90)
plt.show()
The second index should be just the date and not time, but it incorrectly plots like this:Incorrect Plot
This should do the trick. Of course there are 'prettier' ways, but I have tried to make it so that you can keep your data and original data frame as close as to the original one in your question.
Edited after comments: so how about this, just create a new column with the date that you format in whatever shape you want. Then use set_xticklabels() passing that column to set the ticks as you want. Also you might want to remove the default plt.xlabel (otherwise you would have below your xticks the name of the indexes).
import datetime as dt
import pandas as pd
import time
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
# the first part of your code is the same
data = {("IVOG",1493510400000):{"Adj_Close":119.2136,"MA(3)":119.2136,"EWMA(3)":119.2136},
("IVOG",1496188800000):{"Adj_Close":120.8236,"MA(3)":120.0186,"EWMA(3)":120.0454},
("IVOG",1498780800000):{"Adj_Close":120.2736,"MA(3)":120.1036,"EWMA(3)":120.1266},
("IVOG",1501459200000):{"Adj_Close":121.7836,"MA(3)":120.5236,"EWMA(3)":120.5832},
("IVOG",1504137600000):{"Adj_Close":120.3536,"MA(3)":120.4896,"EWMA(3)":120.5309},
("IVOG",1506729600000):{"Adj_Close":124.3336,"MA(3)":121.1303,"EWMA(3)":121.2749}}
df=pd.DataFrame.from_dict(data, orient = 'index')
# first let's give a name to the indexes
df.index.names = ['ivog', 'timestamp']
# then create a new column with a datetime object
# (formatted to microseconds as your data seems to be)
df['date'] = pd.to_datetime(df.index.levels[1],
unit='ms')
# now let's change the date to the format you want
df['date'] = df['date'].apply(lambda x: x.strftime("%Y %B"))
print(df)
# plot the data just like you were doing
ax = plt.gca() # get current axis
df.plot(kind='line',y='Adj_Close', ax=ax)
df.plot(kind='line',y='MA(3)',ax=ax)
df.plot(kind='line',y='EWMA(3)', color='green', ax=ax)
# Now the x-axis label should be what you wished for
ax.set_xticklabels(df['date'])
plt.xlabel('Your x axis label')
plt.ylabel('Your y axis label')
plt.title('My Awseome Plot')
plt.xticks(rotation=45)

How to plot a dataframe using date as the x axis

I have a simple dataframe with two columns, 'date' and 'amount'. I want to plot the amount using date as the x-axis. The first lines of the data are:
22/05/2018,52068.67
21/05/2018,52159.19
15/05/2018,52744.03
08/05/2018,54666.21
08/05/2018,54677.51
01/05/2018,53890.59
30/04/2018,54812.25
27/04/2018,52258.23
26/04/2018,52351.47
23/04/2018,49777.04
23/04/2018,49952.44
23/04/2018,49992.44
05/04/2018,53238.59
03/04/2018,53631.09
03/04/2018,53839.64
28/03/2018,50836.78
26/03/2018,51206.67
26/03/2018,51372.02
14/03/2018,51110.17
12/03/2018,51411.31
06/03/2018,51169.91
05/03/2018,51374.57
27/02/2018,48728.85
27/02/2018,48730.5
16/02/2018,44988.25
14/02/2018,41948.03
12/02/2018,43776.31
12/02/2018,43800.31
12/02/2018,43840.11
05/02/2018,29358.96
26/01/2018,39491.0
24/01/2018,36470.03
23/01/2018,36562.76
23/01/2018,36616.61
22/01/2018,36582.46
22/01/2018,36665.71
22/01/2018,36743.31
17/01/2018,36965.3
16/01/2018,37044.6
09/01/2018,42083.65
08/01/2018,42183.39
05/01/2018,42285.41
03/01/2018,41537.51
03/01/2018,41579.51
02/01/2018,41945.32
27/12/2017,43003.33
27/12/2017,43217.29
18/12/2017,38208.63
15/12/2017,38315.53
However, the plot gives me points that don't appear in the data. For example, in May 2018 there is no value near 30000.
My code is:
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("test.csv", header=None, names =['date', 'amount'])
df['time'] = pd.to_datetime(df['date'])
df.set_index(['time'],inplace=True)
df['amount'].plot()
plt.show()
What am I doing wrong?
You need to covert the dates to date time using correct format and use pandas plot
df['date'] = pd.to_datetime(df['date'], format = '%d/%m/%Y')
df.plot('date', 'amount')

datetime x-axis matplotlib labels causing uncontrolled overlap

I'm trying to plot a pandas series with a 'pandas.tseries.index.DatetimeIndex'. The x-axis label stubbornly overlap, and I cannot make them presentable, even with several suggested solutions.
I tried stackoverflow solution suggesting to use autofmt_xdate but it doesn't help.
I also tried the suggestion to plt.tight_layout(), which fails to make an effect.
ax = test_df[(test_df.index.year ==2017) ]['error'].plot(kind="bar")
ax.figure.autofmt_xdate()
#plt.tight_layout()
print(type(test_df[(test_df.index.year ==2017) ]['error'].index))
UPDATE: That I'm using a bar chart is an issue. A regular time-series plot shows nicely-managed labels.
A pandas bar plot is a categorical plot. It shows one bar for each index at integer positions on the scale. Hence the first bar is at position 0, the next at 1 etc. The labels correspond to the dataframes' index. If you have 100 bars, you'll end up with 100 labels. This makes sense because pandas cannot know if those should be treated as categories or ordinal/numeric data.
If instead you use a normal matplotlib bar plot, it will treat the dataframe index numerically. This means the bars have their position according to the actual dates and labels are placed according to the automatic ticker.
import pandas as pd
import numpy as np; np.random.seed(42)
import matplotlib.pyplot as plt
datelist = pd.date_range(pd.datetime(2017, 1, 1).strftime('%Y-%m-%d'), periods=42).tolist()
df = pd.DataFrame(np.cumsum(np.random.randn(42)),
columns=['error'], index=pd.to_datetime(datelist))
plt.bar(df.index, df["error"].values)
plt.gcf().autofmt_xdate()
plt.show()
The advantage is then in addition that matplotlib.dates locators and formatters can be used. E.g. to label each first and fifteenth of a month with a custom format,
import pandas as pd
import numpy as np; np.random.seed(42)
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
datelist = pd.date_range(pd.datetime(2017, 1, 1).strftime('%Y-%m-%d'), periods=93).tolist()
df = pd.DataFrame(np.cumsum(np.random.randn(93)),
columns=['error'], index=pd.to_datetime(datelist))
plt.bar(df.index, df["error"].values)
plt.gca().xaxis.set_major_locator(mdates.DayLocator((1,15)))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%d %b %Y"))
plt.gcf().autofmt_xdate()
plt.show()
In your situation, the easiest would be to manually create labels and spacing, and apply that using ax.xaxis.set_major_formatter.
Here's a possible solution:
Since no sample data was provided, I tried to mimic the structure of your dataset in a dataframe with some random numbers.
The setup:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
# A dataframe with random numbers ro run tests on
np.random.seed(123456)
rows = 100
df = pd.DataFrame(np.random.randint(-10,10,size=(rows, 1)), columns=['error'])
datelist = pd.date_range(pd.datetime(2017, 1, 1).strftime('%Y-%m-%d'), periods=rows).tolist()
df['dates'] = datelist
df = df.set_index(['dates'])
df.index = pd.to_datetime(df.index)
test_df = df.copy(deep = True)
# Plot of data that mimics the structure of your dataset
ax = test_df[(test_df.index.year ==2017) ]['error'].plot(kind="bar")
ax.figure.autofmt_xdate()
plt.figure(figsize=(15,8))
A possible solution:
test_df = df.copy(deep = True)
ax = test_df[(test_df.index.year ==2017) ]['error'].plot(kind="bar")
plt.figure(figsize=(15,8))
# Make a list of empty myLabels
myLabels = ['']*len(test_df.index)
# Set labels on every 20th element in myLabels
myLabels[::20] = [item.strftime('%Y - %m') for item in test_df.index[::20]]
ax.xaxis.set_major_formatter(ticker.FixedFormatter(myLabels))
plt.gcf().autofmt_xdate()
# Tilt the labels
plt.setp(ax.get_xticklabels(), rotation=30, fontsize=10)
plt.show()
You can easily change the formatting of labels by checking strftime.org

Categories