Subplot from multi excels - python

I am dealing with 20 Excel files. For each file, I do the same subsetting and plotting. Using a for loop I can currently get 20 separate figures, but how can I put them into a single figure with a 5×4 grid of subplots? I checked some posts but could not find the answer.
# Draw one 2-D histogram (slope vs. AI) per Excel workbook, each in its own figure.
for files in allfiles:
    # if re.search(r".*.xlsx", files):
    # Raw string: the Windows path is full of backslashes that must not be
    # interpreted as escape sequences.
    df = pd.read_excel(r"D:\Brown research\Task2 site selection\All_excel\{0}".format(files))
    newdf = df[(df.slope != 0) & (df.AI >= 0.8) & (df.reach_len > 5000)]

    # Outlier cut-off: third quartile plus three IQRs, computed on the
    # unfiltered slopes of this workbook.
    Q1 = df['slope'].quantile(0.25)
    Q3 = df['slope'].quantile(0.75)
    IQR = Q3 - Q1
    Upper_Whisker = Q3 + 3 * IQR
    newdf = newdf[newdf['slope'] < Upper_Whisker]

    x = newdf['slope']
    y = newdf['AI']
    nbins = 20
    plt.figure()
    # cmin=1 leaves bins without any sample blank
    plt.hist2d(x, y, nbins, cmap=plt.cm.coolwarm, cmin=1)
    plt.colorbar()
    # Title is the part of the file name before the first underscore
    plt.title(files.split('_')[0], x=0.5, y=0.9)

Create your fig and axes ahead of time and use matplotlib's object-oriented interface:
from pathlib import Path
from matplotlib import pyplot
import pandas
basedir = Path(r"D:\Brown research\Task2 site selection\All_excel")

# One 5x4 grid of axes; each workbook gets its own cell.
fig, axes = pyplot.subplots(5, 4, figsize=(10, 10))

# zip() stops at the shorter input, so at most 20 workbooks are drawn.
for xlfile, ax in zip(basedir.glob("*.xlsx"), axes.flat):
    df = pandas.read_excel(xlfile)

    # Outlier cut-off: third quartile plus three IQRs of the raw slopes.
    Q1 = df['slope'].quantile(0.25)
    Q3 = df['slope'].quantile(0.75)
    IQR = Q3 - Q1
    Upper_Whisker = Q3 + 3 * IQR

    newdf = df.loc[lambda df:
        (df["slope"] != 0) &
        (df["slope"] < Upper_Whisker) &
        (df["AI"] >= 0.8) &
        (df["reach_len"] > 5000)
    ]
    x = newdf['slope']
    y = newdf['AI']
    nbins = 20

    # hist2d returns (counts, xedges, yedges, image); the colorbar is built
    # from the image and attached to this cell's axes.
    *_, image = ax.hist2d(x, y, nbins, cmap=pyplot.cm.coolwarm, cmin=1)
    fig.colorbar(image, ax=ax)
    ax.set_title(xlfile.stem.split('_')[0], x=0.5, y=0.9)
This assumes that you have exactly 20 files in your directory.
Better yet, I would concat everything into a single dataframe and use seaborn to build up the plot dynamically:
from pathlib import Path
from matplotlib import pyplot
import pandas
import seaborn
def get_upper_whisker(df, column, factor=3):
    """Return the upper outlier fence of ``df[column]``.

    The fence is ``Q3 + factor * IQR`` where ``IQR = Q3 - Q1`` and Q1/Q3 are
    the 0.25 and 0.75 quantiles of the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    column : str
        Name of the numeric column.
    factor : float, optional
        Number of IQRs added on top of Q3 (default 3).

    Returns
    -------
    float
        The fence value.
    """
    q1, q3 = df[column].quantile([0.25, 0.75])
    return q3 + factor * (q3 - q1)
basedir = Path(r"D:\Brown research\Task2 site selection\All_excel")

# Stack every workbook into one frame, tagging each row with the stem of the
# file it came from so it can be used as the facet key.
data = pandas.concat([
    pandas.read_excel(xlfile).assign(file=xlfile.stem)
    for xlfile in basedir.glob("*.xlsx")
])

fig = (
    data.groupby("file")
        # per-file outlier cut: the fence is computed from that file's slopes only
        .apply(lambda g:
            g.loc[g["slope"] < get_upper_whisker(g, "slope")]
        )
        .loc[lambda df:
            (df["slope"] != 0) &
            (df["AI"] >= 0.8) &
            (df["reach_len"] > 5000)
        ]
        # one facet per file, wrapping to a new row after four columns
        .pipe(seaborn.FacetGrid, col="file", col_wrap=4)
)
fig.map(pyplot.hist2d, "slope", "AI", bins=20, cmap=pyplot.cm.coolwarm, cmin=1)

Thank you for the two solutions from #Paul H. For solution 1, just need to revise a little bit, e.g., "fig.colorbar(hist[3],ax=ax)". hist[3] is the "image".

Related

How to create time_mask with two conditions in Python

I need to plot data that starts and end at a certain time, next to this I need to exclude a period in the weekend in that time period.
How can I create a time_mask of my data that has two rules?
I already created a code for the "Start" and "End" period, but I am not able to add the rule for excluding the "Weekend period".
# Boundaries of the training window and of the weekend period.
start_date = '2022-06-30 15:26:00'
end_date = '2022-07-11 15:30:00'
weekend_end = '2022-07-08 14:30:00'
weekend_start = '2022-07-11 09:50:00'

# Boolean mask: rows strictly after the start and up to (including) the end.
timestamps = df['Time']
after_start = timestamps > start_date
up_to_end = timestamps <= end_date
time_mask = after_start & up_to_end

# use only this part of the dataframe as training data
df1_train = df1.loc[time_mask]
I tried to exclude the "Weekend period" with the code below, but this is not working...
# NOTE(review): this is the attempt reported as "not working". Presumably the
# culprit is `or`, which does not combine the two comparisons element-wise;
# the answer further down replaces it with `|`.
time_mask = ((df['Time'] > start_date) & (df['Time'] <= end_date) & ((df['Time'] < weekend_start) or (df['Time'] > weekend_end)))
I already solved the problem for one part. But now in my plot the period is not excluded:
Plot
Plot in operating hours
UPDATE 22-08-22
#%% Plot data
timestamp_format = '%Y-%m-%d %H:%M:%S'

fig, ax = plt.subplots()
ax.plot(df['Time'], df1[Temp])
ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter(timestamp_format))
fig.autofmt_xdate()
plt.show()

#%% Plot the data without empty values
N = len(df['Time'])
ind = np.arange(N)


def format_date(x, pos=None):
    """Tick formatter: turn a row position into that row's timestamp text."""
    # round to the nearest row and keep it inside the valid range
    thisind = np.clip(int(x + 0.5), 0, N - 1)
    return df['Time'][thisind].strftime('%Y-%m-%d %H:%M:%S')


fig, ax = plt.subplots()
# x is the row position, so periods without data take up no space
ax.plot(ind, df[Temp])
ax.xaxis.set_major_formatter(ticker.FuncFormatter(format_date))
ax.set_title("Without empty values")
fig.autofmt_xdate()
plt.show()
Update 22-08-22
use '|' instead of or.
And in my opinion, you confused weekend_end with weekend_start, since the start is a later date, and the end, on the contrary, is early.
After filtering by condition:
(df['Time'] > start_date) & (df['Time'] <= end_date)
the data is again filtered by time greater than weekend_start:
(df['Time'] > weekend_start)
or time less than weekend_end:
(df['Time'] < weekend_end)
that is, the period from 2022-07-08 14:30:00 to 2022-07-11 09:50:00 is excluded.
Now about drawing. The fact is that the axis with dates and times is continuous. Even if there is no data in a certain period. On the left is a picture that does not remove this gap, on the right, the 'format_date' function is used to exclude this gap.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates
import matplotlib.ticker as ticker
df = pd.read_csv('Data.csv', sep=',', header=0)

start_date = '2022-06-30 15:26:00'
end_date = '2022-07-11 15:30:00'
# Names follow the question: the excluded span runs from weekend_end up to
# weekend_start.
weekend_end = '2022-07-08 14:30:00'
weekend_start = '2022-07-11 09:50:00'

df['Timestamp'] = pd.to_datetime(df['Timestamp'])
stamps = df['Timestamp']
inside_window = (stamps > start_date) & (stamps <= end_date)
outside_weekend = (stamps > weekend_start) | (stamps < weekend_end)
time_mask = inside_window & outside_weekend
df1 = df[time_mask].copy().set_index('Timestamp')

fig, axes = plt.subplots(ncols=2)

# Left panel: real datetime axis, so the excluded span shows up as a gap.
ax = axes[0]
ax.plot(df1.index, df1['Data'])
ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%Y-%m-%d %H:%M:%S'))
ax.set_title("Default")
fig.autofmt_xdate()

# Right panel: plot against the row position and translate ticks back to
# timestamps, which removes the gap.
N = len(df1['Data'])
ind = np.arange(N)


def format_date(x, pos=None):
    """Tick formatter: map a row position to the timestamp stored at that row."""
    row = np.clip(int(x + 0.5), 0, N - 1)
    return df1.index[row].strftime('%Y-%m-%d %H:%M:%S')


ax = axes[1]
ax.plot(ind, df1['Data'])
ax.xaxis.set_major_formatter(ticker.FuncFormatter(format_date))
ax.set_title("Without empty values")
fig.autofmt_xdate()
plt.show()
Note that the 'Timestamp' column is converted to an index.
df1 = df1.set_index('Timestamp')
Below is the drawing code with a simple moving average. It's hard for me to calculate ema. You can use a library like TA-Lib.
# Simple moving average over a window of 33 samples.
df1['sma'] = df1['Data'].rolling(window=33).mean()

N = len(df1.index)
ind = np.arange(N)


def format_date(x, pos=None):
    """Tick formatter: map a row position to the timestamp stored at that row."""
    row = np.clip(int(x + 0.5), 0, N - 1)
    return df1.index[row].strftime('%Y-%m-%d %H:%M:%S')


fig, ax = plt.subplots()
# raw series and its moving average share the gap-free positional x axis
for column in ('Data', 'sma'):
    ax.plot(ind, df1[column])
ax.xaxis.set_major_formatter(ticker.FuncFormatter(format_date))
fig.autofmt_xdate()
plt.show()
It also seems correct to me to convert the boundary strings to datetime format, written in the same format as in the file:
# Parse the four boundaries (written with a +02:00 offset) in one go;
# errors='coerce' yields NaT for an unparseable string instead of raising.
start_date, end_date, weekend_end, weekend_start = (
    pd.to_datetime(stamp, errors='coerce')
    for stamp in (
        '2022-06-30T15:26:00+02:00',
        '2022-07-11T15:30:00+02:00',
        '2022-07-08T14:30:00+02:00',
        '2022-07-11T09:50:00+02:00',
    )
)
Update 12/09/2022.
Made it more convenient to draw without gaps: a column is created from the index by converting the timestamps to strings. It is the same principle as in the previous version, but here everything is done at once, without a formatter function. MaxNLocator is also applied to set how many tick divisions are displayed.
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
df = pd.read_csv('Data.csv', sep=',', header=0)

start_date = '2022-06-30 15:26:00'
end_date = '2022-07-11 15:30:00'
# Names follow the question: the excluded span runs from weekend_end up to
# weekend_start.
weekend_end = '2022-07-08 14:30:00'
weekend_start = '2022-07-11 09:50:00'

df['Timestamp'] = pd.to_datetime(df['Timestamp'])
stamps = df['Timestamp']
inside_window = (stamps > start_date) & (stamps <= end_date)
outside_weekend = (stamps > weekend_start) | (stamps < weekend_end)
time_mask = inside_window & outside_weekend
df1 = df[time_mask].copy().set_index('Timestamp')

# Timestamps rendered as text act as x values, so missing periods leave no gap.
df1['string'] = df1.index.astype(str)
df1['sma'] = df1['Data'].rolling(window=33).mean()

fig, ax = plt.subplots()
for column in ('Data', 'sma'):
    ax.plot(df1['string'], df1[column])
ax.xaxis.set_major_locator(MaxNLocator(nbins=5))
fig.autofmt_xdate()
plt.show()

python percentage label with groupby and barchart

# Size of every (calls, churn) group, expressed as a percentage of all rows.
group_sizes = df.groupby(['customer service calls', 'churn']).size()
percentage_df = group_sizes.groupby(level=0).apply(
    lambda x: np.round(x * 100 / len(df), 2))

# One stacked bar per number of calls, one segment per churn value.
ax = percentage_df.unstack().plot(kind='bar', stacked=True, figsize=(10,10))
for c in ax.containers:
    ax.bar_label(c, label_type='center', color='black')
outcome :
Hi, I am new to Python, and the image below is what I am planning to code:
1. the bars should stay the same, but each label should show the percentage within that particular group
2. add a % sign
3. only show the percentage of the orange parts
#1 is the main problem I am having.
Would you please help me on this problem?
Many thanks in advance.
Is that what you want ? (I used bar containers to edit the bars data and a little of list comprehension) :
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
# init df: 40 random rows of (number of calls, churned or not)
df = pd.DataFrame({'calls':[np.random.randint(0,10) for _ in range(40)],
                   'churn': [random.choice([True, False]) for _ in range(40)]})

# preprocess the dataframe: per number of calls, count churned / not churned
churn_by_calls = df.groupby('calls')['churn']
true_perc = churn_by_calls.mean() * 100
true_count = churn_by_calls.sum()
false_count = churn_by_calls.count() - true_count

# final df: one row per number of calls, one column per outcome
df = pd.concat([false_count, true_count], axis=1)
df.columns = ['false_count', 'true_count']

# create plot
ax = df.plot(kind='bar', stacked=True, figsize=(10,10))

# second container = bars of the second column (true_count)
container_t = ax.containers[1]

# custom labels: churn percentage of the group, blank when it is zero
labels = ['' if val == 0 else f'{round(val, 2)} %' for val in true_perc]

# annotate with the previous custom labels
ax.bar_label(container_t, labels=labels, label_type='center', fontsize=7)

# pad the spacing between the number and the edge of the figure
ax.margins(y=0.1)

# show plot
plt.show()
output:

Candlesticks in matplotlib

I'm trying to make a crypto scanner but I'm struggling a bit. The code right now can loop through the different coins in symbols.csv and print plots for all of them. The plots include the close price, the SMA and Bollinger Bands. Now I really want the close price to be drawn as candlesticks and not as a line. I've found that there are other plotting packages, like mpf (mplfinance), that can draw candlesticks. The problem is that I don't know how to make the Bollinger Bands work with the mpf plots, and I don't know how to make candlesticks work with matplotlib. Can someone help me either make candlesticks in matplotlib or draw the Bollinger Bands in the mpf plots?
Thanks in advance!
The graph looks like this right now
import yfinance as yf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
# One ticker symbol per line.
with open('symbols.csv') as f:
    symbols = f.read().splitlines()

for symbol in symbols:
    df = yf.download(symbol, start='2020-01-01')
    # df = yf.download(symbol, period = '22h', interval = '15m')
    print(df)
    # df = yf.download('ADA-USD', start='2021-01-01')

    # Bollinger Bands: 20-period SMA plus/minus two rolling standard deviations.
    df['SMA'] = df.Close.rolling(window=20).mean()
    df['stddev'] = df.Close.rolling(window=20).std()
    df['Upper'] = df.SMA + 2 * df.stddev
    df['Lower'] = df.SMA - 2 * df.stddev
    df['Buy_Signal'] = df.Lower > df.Close
    df['Sell_Signal'] = df.Upper < df.Close

    # Walk the rows by position and alternate buy / sell: a buy is recorded
    # only while flat, a sell only while holding.  Iterating the values
    # directly avoids integer-key lookups on the date-indexed Series.
    buys = []
    sells = []
    open_pos = False
    rows = zip(df.Close, df.Lower, df.Upper)
    for i, (close, lower, upper) in enumerate(rows):
        if lower > close:
            if not open_pos:
                buys.append(i)
                open_pos = True
        elif upper < close:
            if open_pos:
                sells.append(i)
                open_pos = False

    plt.figure(figsize=(12, 6))
    plt.scatter(df.iloc[buys].index, df.iloc[buys].Close, marker='^', color='g')
    plt.scatter(df.iloc[sells].index, df.iloc[sells].Close, marker='^', color='r')
    plt.plot(df[['Close', 'SMA', 'Upper', 'Lower']])
    plt.fill_between(df.index, df.Upper, df.Lower, color='grey', alpha=0.3)
    plt.legend(['Close', 'SMA', 'Upper', 'Lower'])
    plt.show()

    # Buy and sell prices side by side, aligned on their dates.
    merged = pd.concat([df.iloc[buys].Close, df.iloc[sells].Close], axis=1)
    merged.columns = ['Buys', 'Sells']
    print(merged)

    # shift(-1) moves the Sells value of the following row up next to each buy
    totalprofit = merged.shift(-1).Sells - merged.Buys
    print(totalprofit)
    relprofits = totalprofit / merged.Buys
    print(relprofits.mean())
The links in the comments provide a wealth of examples. Since you want to graph candlesticks, Bollinger Bands, and SMAs in mpf, I have modified the additional plot examples from the previous examples to suit your needs. The graphs were created from data obtained in stocks instead of currencies.
import yfinance as yf
import pandas as pd
import mplfinance as mpf
df = yf.download("AAPL", start="2020-01-01")

# Bollinger Bands: 20-period SMA plus/minus two rolling standard deviations.
rolling_close = df.Close.rolling(window=20)
df['SMA'] = rolling_close.mean()
df['stddev'] = rolling_close.std()
df['Upper'] = df.SMA + 2 * df.stddev
df['Lower'] = df.SMA - 2 * df.stddev

# Plain comparisons already give boolean columns, so numpy (which this
# snippet never imports) is not needed.
df['Buy_Signal'] = df.Lower > df.Close
df['Sell_Signal'] = df.Upper < df.Close

# Band and SMA lines are passed to mplfinance as an additional plot that is
# drawn together with the candlesticks.
tcdf = df[['Lower','Upper','SMA']]
apd = mpf.make_addplot(tcdf)
mpf.plot(df, figratio=(8,4), type='candle', addplot=apd, volume=False, style='yahoo')

How to grid plot 2D categorical data

I have data that looks like:
Name X Y
A HIGH MID
B LOW LOW
C MID LOW
D HIGH MID
How can I plot this data in a 2-D diagram with a 3x3 grid, adding a random variation to the position of each data point (labelled with its name) so that the points keep enough spacing between each other?
So it should look somewhat like that:
The following is what I tried, but I don't know how to plot the values in between the grid lines rather than exactly on them, so that they do not overlap.
import pandas as pd
import matplotlib.pyplot as plt
### Mock Data ###
data = """A0,LOW,LOW
A,MID,MID
B,LOW,MID
C,MID,HIGH
D,LOW,MID
E,HIGH,HIGH"""
rows = [x.split(',') for x in data.split('\n')]
df = pd.DataFrame(rows)
df.columns = ['name','X','Y']

### Plotting ###
fig,axs = plt.subplots()
# X and Y are category strings, so points land exactly on the category ticks
axs.scatter(df.X,df.Y,label=df.name)
axs.set_xlabel('X')
axs.set_ylabel('Y')
# write each point's name at its own position
for p, x_cat, y_cat in zip(df.name, df.X, df.Y):
    axs.annotate(p, (x_cat, y_cat))
axs.grid()
axs.set_axisbelow(True)
fig.tight_layout()
plt.show()
resulting:
You can control the positions directly and change the labels on the axes. There are a few problems with your drawing because it does not take into account issues such as "what label should be shown when more than one point sits at the same location?".
In any case here is a possible solution:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
### Mock Data ###
data = """A0,LOW,LOW
A,MID,MID
B,LOW,MID
C,MID,HIGH
D,LOW,MID
E,HIGH,HIGH"""
df = pd.DataFrame([x.split(',') for x in data.split('\n')])
df.columns = ['name','X','Y']

# Category label -> numeric position on the axis.
pos = [0, 1, 2]
lbls = ["LOW", "MID", "HIGH"]
trans = dict(zip(lbls, pos))

# Shift of the text relative to its marker, in axis units.
offset = 0.05

# Numeric coordinates of every point.
xxs = [trans[label] for label in df.X]
yys = [trans[label] for label in df.Y]

# Names collected per grid cell.  Plain lists put no limit on the length of
# the joined label, and every cell is labelled exactly once.
cell_names = {}
for name, xc, yc in zip(df["name"], xxs, yys):
    cell_names.setdefault((xc, yc), []).append(name)

fig,axs = plt.subplots()
axs.scatter(xxs, yys)
for (xc, yc), names in cell_names.items():
    axs.text(xc + offset, yc + offset, ";".join(names))

# Show the category names instead of the numeric positions.
axs.set_xticks(pos)
axs.set_xticklabels(lbls)
axs.set_yticks(pos)
axs.set_yticklabels(lbls)

# Cell borders sit half a unit before each position.
for p in pos:
    axs.axhline(p - 0.5, color="black")
    axs.axvline(p - 0.5, color="black")
axs.set_xlim(-0.5, 2.5)
axs.set_ylim(-0.5, 2.5)
plt.show()
This result in the following image:

Set maximum of datapoints per plot

Im using the following code:
import matplotlib.pyplot as pyplot
import pandas as pandas
from datetime import datetime
dataset = pandas.read_csv("HugLog_17.01.11.csv", sep=",", header=0)

print('filter data for SrcAddr')
from_sender = dataset['SrcAddr']=='0x1FD3'
dataset_filtered = dataset[from_sender]

print('get Values')
varY = dataset_filtered.Battery_Millivolt.values
varX = dataset_filtered.Timestamp.values

print('Convert the date-strings in date-objects.')
dates_list = [datetime.strptime(stamp, '%y-%m-%d %H:%M:%S') for stamp in varX]

# Single axes: one bar per reading, time on x and millivolts on y.
fig = pyplot.figure()
ax1 = fig.add_subplot(1,1,1)
ax1.set_xlabel('Time')
ax1.set_ylabel('Millivolt')
ax1.bar(dates_list, varY)

pyplot.locator_params(axis='x',nbins=10)
pyplot.show()
The problem I have is that it is a large data collection with 180k data points.
Pyplot displays all points on the graph, which makes it slow, and the bars overlap. Is there a way to set a maximum limit on how many data points are displayed in a "view"?
What I mean by that is that as soon as the graph is rendered there are only 50 data points, and when I zoom in I again get a maximum of 50 data points.
Resampling can be done with the resample function from pandas.
Note that the resample syntax has changed between version 0.17 and 0.19 of pandas. The example below uses the old style. See e.g. this tutorial for the new style.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# generate some data for every second over a whole day
times = pd.date_range(start='2017-01-11',periods=86400, freq='1S')
df = pd.DataFrame(index = times)
# descending integer ramp with uniform noise on top
ramp = np.sort(np.random.randint(low=1300, high=1600, size=len(df.index)) )[::-1]
noise = np.random.rand(len(df.index))*100
df['data'] = ramp + noise

# resample the data, taking the mean over 1 hours ("H")
t = "H"  # for hours, try "T" for minutes as well
width = 1./24  # matplotlib default uses a width of 1 day per bar
# try width=1./(24*60) for minutes

# NOTE: `how=` is the old-style resample call mentioned in the text above.
hourly_mean = df.data.resample(t, how="mean")
df_resampled = pd.DataFrame()
df_resampled['data'] = hourly_mean

fig, ax = plt.subplots()
#ax.bar(df.index, df['data'], width=1./(24*60*60)) # original data, takes too long to plot
ax.bar(df_resampled.index, df_resampled['data'], width=width)
ax.xaxis_date()
plt.show()
Automatic adaption of the resampling when zooming would indeed require some manual work. There is a resampling example on the matplotlib event handling page, which does not work out of the box but could be adapted accordingly.
This is how it would look like:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import matplotlib.dates
class Sampler():
    """Re-bin a time-indexed frame to a resolution that suits the visible x-range.

    ``df`` is expected to carry a datetime index and a ``data`` column; both
    are used by :meth:`resample`.
    """

    # (minimum visible span, resample rule, bar width in days), coarsest first.
    # Lower-case rules are used because the upper-case "H"/"T"/"S" aliases
    # are deprecated in recent pandas.
    _LEVELS = (
        (datetime.timedelta(hours=5), "h", 1. / 24),
        (datetime.timedelta(minutes=60), "15min", 15. / (24. * 60)),
        (datetime.timedelta(minutes=5), "min", 1. / (24. * 60)),
        (datetime.timedelta(seconds=60), "15s", 15. / (24. * 60 * 60)),
    )
    # Used when the visible span is 60 seconds or shorter.
    _FINEST = ("s", 1. / (24. * 60 * 60))

    def __init__(self, df):
        self.df = df

    def resample(self, limits):
        """Average ``df.data`` into bins sized for ``limits`` and crop to them.

        Parameters
        ----------
        limits : sequence of two
            Lower and upper x limit, either timestamps or matplotlib date
            numbers (floats counted in days).

        Returns
        -------
        tuple
            ``(index, values, width)``: bin timestamps, bin means and the
            width of one bin in days (the unit used for bar widths on a
            date axis).
        """
        print(limits)
        dt = limits[1] - limits[0]
        # Axis limits arrive as plain numbers of days; pd.Timedelta is a
        # datetime.timedelta subclass, so one isinstance check covers both.
        if not isinstance(dt, datetime.timedelta):
            dt = datetime.timedelta(days=dt)
        print(dt)

        # see http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
        for min_span, t, width in self._LEVELS:
            if dt > min_span:
                break
        else:
            t, width = self._FINEST

        self.resampled = self.df.data.resample(t).mean().to_frame("data")
        print(t, len(self.resampled['data']))
        print("indextype", type(self.resampled.index[0]))
        print("limitstype", type(limits[1]))

        low, high = limits[0], limits[1]
        if isinstance(high, (float, np.floating)):
            low = matplotlib.dates.num2date(low)
            high = matplotlib.dates.num2date(high)
            # num2date yields timezone-aware datetimes; drop the zone when
            # the index is naive so the two can be compared.
            if self.resampled.index.tz is None:
                low = low.replace(tzinfo=None)
                high = high.replace(tzinfo=None)
            print(type(high), high)

        index = self.resampled.index
        self.resampled = self.resampled.loc[(index >= low) & (index <= high)]
        return self.resampled.index, self.resampled['data'], width

    def update(self, ax):
        """Callback for ``xlim_changed``: redraw the bars for the new x-range."""
        print("update")
        start, stop = ax.viewLim.intervalx
        ax.clear()
        x, y, width = self.resample([start, stop])
        ax.bar(x, y, width=width)
        ax.set_xlim([start, stop])
        # Re-register after set_xlim so that call does not re-enter this
        # handler.  NOTE(review): presumably needed because ax.clear() drops
        # the registered callbacks; confirm against the matplotlib version.
        ax.callbacks.connect('xlim_changed', self.update)
        ax.figure.canvas.draw()
# One sample per second over a whole day: a descending ramp plus noise.
times = pd.date_range(start='2017-01-11',periods=86400, freq='1S')
df = pd.DataFrame(index = times)
ramp = np.sort(np.random.randint(low=1300, high=1600, size=len(df.index)) )[::-1]
noise = np.random.rand(len(df.index))*500
df['data'] = ramp + noise

sampler = Sampler(df)
# initial view: the full range of the data
full_range = [df.index[0], df.index[-1]]
x,y,width = sampler.resample(full_range)

fig, ax = plt.subplots()
ax.bar(x,y, width=width)
ax.xaxis_date()
# connect to limits changes
ax.callbacks.connect('xlim_changed', sampler.update)
plt.show()
One thing you can do is plot a random subset of the data by using the sample method on your pandas DataFrame. Use the frac argument to determine the fraction of points you want to use. It ranges from 0 to 1.
After you get your dataset_filtered DataFrame, take a sample of it like this
dataset_filtered_sample = dataset_filtered.sample(frac=.001)

Categories