Weird Time-Series Graph Using Pycaret and plotly - python

I am trying to visualize air quality data as time-series charts using the PyCaret and Plotly Dash Python libraries, but I am getting very weird graphs. Below is my code:
import pandas as pd
import plotly.express as px
data = pd.read_csv('E:/Self Learning/Djang_Dash/2019-2020_5.csv')
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')
#data.set_index('Date', inplace=True)
# combine store and item column as time_series
data['OBJECTID'] = ['Location_' + str(i) for i in data['OBJECTID']]
#data['AQI_Bins_AI'] = ['Bin_' + str(i) for i in data['AQI_Bins_AI']]
data['time_series'] = data[['OBJECTID']].apply(lambda x: '_'.join(x), axis=1)
data.drop(['OBJECTID'], axis=1, inplace=True)
# extract features from date
data['month'] = [i.month for i in data['Date']]
data['year'] = [i.year for i in data['Date']]
data['day_of_week'] = [i.dayofweek for i in data['Date']]
data['day_of_year'] = [i.dayofyear for i in data['Date']]
data.head(4000)
data['time_series'].nunique()
for i in data['time_series'].unique():
subset = data[data['time_series'] == i]
subset['moving_average'] = subset['CO'].rolling(window = 30).mean()
fig = px.line(subset, x="Date", y=["CO","moving_average"], title = i, template = 'plotly_dark')
fig.show()
I would appreciate any help in this regard.
Here is my sample data: Google Drive Link

The data has not been provided in a usable way, so I sought out similar publicly available data and found: https://www.kaggle.com/rohanrao/air-quality-data-in-india?select=station_hour.csv
Using this data, with a couple of cleanups of your code, there are no issues with the plots. I suspect your data has one of these issues:
the date column is not datetime64[ns] in your data frame
the data is not sorted by date, leading to lines being drawn in the way you have noted
By refactoring the way the moving average is calculated, you can use animation instead of lots of separate figures.
get some data
import kaggle.cli
import sys, math
import pandas as pd
from pathlib import Path
from zipfile import ZipFile
import plotly.express as px
# download data set
# https://www.kaggle.com/rohanrao/air-quality-data-in-india?select=station_hour.csv
# The kaggle CLI reads its arguments from sys.argv, so put the
# "datasets download <dataset>" command there before invoking it.
cli_args = ["datasets", "download", "rohanrao/air-quality-data-in-india"]
sys.argv = [sys.argv[0], *cli_args]
kaggle.cli.main()

# Open the downloaded archive (looked up relative to the current working
# directory) and list the files it contains.
zfile = ZipFile("air-quality-data-in-india.zip")
print([member.filename for member in zfile.infolist()])
plot using code from question
import pandas as pd
import plotly.express as px
from pathlib import Path

# NOTE: the original version check relied on distutils.version.StrictVersion.
# distutils is deprecated and was removed in Python 3.12, and the check is no
# longer needed because the moving average below works the same way on old
# and new pandas releases.

# data = pd.read_csv('E:/Self Learning/Djang_Dash/2019-2020_5.csv')
# use kaggle data
# dfs = {f.filename:pd.read_csv(zfile.open(f)) for f in zfile.infolist() if f.filename in ['station_day.csv',"stations.csv"]}
# data = pd.merge(dfs['station_day.csv'],dfs["stations.csv"], on="StationId")
# data['Date'] = pd.to_datetime(data['Date'])
# # kaggle data is different from question, make it compatible with questions data
# data = data.assign(OBJECTID=lambda d: d["StationId"])

# sample data from google drive link
data2 = pd.read_csv(Path.home().joinpath("Downloads").joinpath("AQI.csv"))
data2["Date"] = pd.to_datetime(data2["Date"])
data = data2

# as per the very first comment - it's important that the data is ordered!
data = data.sort_values(["Date", "OBJECTID"])
data['time_series'] = "Location_" + data["OBJECTID"].astype(str)

# clean up data, remove rows where there is no CO value
data = data.dropna(subset=["CO"])

# 30-row moving average of CO, computed separately for every location in one
# step (can also be used by the animation). transform() returns a Series that
# carries the original row index, so the assignment aligns row by row even
# though the frame is ordered by date rather than by location.
data["moving_average"] = data.groupby("time_series")["CO"].transform(
    lambda co: co.rolling(window=30).mean()
)

# just the first three locations for the purpose of demonstration
for i in data['time_series'].unique()[0:3]:
    subset = data.loc[data['time_series'] == i]
    fig = px.line(subset, x="Date", y=["CO", "moving_average"], title=i, template='plotly_dark')
    fig.show()
can use animation
# One animation frame per location instead of one figure per location.
# The y-axis is fixed to the span from the smallest CO value to the 97th
# percentile of CO.
co_axis_range = [data["CO"].min(), data["CO"].quantile(.97)]
animated_fig = px.line(
    data,
    x="Date",
    y=["CO", "moving_average"],
    animation_frame="time_series",
    template="plotly_dark",
)
animated_fig.update_layout(yaxis={"range": co_axis_range})

Related

Programming a prediction model: code runs but doesn't give output

My code runs properly but it will not provide output as it should. I am not sure where the issue is occurring. Could someone help me correct it? Do you need the CSV too?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
df = pd.read_csv("/content/drive/MyDrive/replicates/Replicate 3 Gilts just measures.csv")
df.info()
df.head()
# removing the irrelevant columns
cols_to_drop = ["animal"]
df = df.drop(columns=cols_to_drop,axis=1)
# first five rows of data frame after removing columns
df.head()
deep_df = df.copy(deep = True)
numerical_columns = [col for col in df.columns if (df[col].dtype=='int64' or
df[col].dtype=='float64')]
df[numerical_columns].describe().loc[['min','max', 'mean','50%'],:]
df[df['i1000.0'] == df['i1000.0'].min()]
This is where the issue occurs
i1000_bucket = df.groupby(pd.cut(df["i1000.0"],bins=[10,20,30,40,50,60,70,80,90,100]))
number_bucket = df.groupby(pd.cut(df["i1000.0"],bins=[10,20,30,40,50,60,70,80,90,100]))
i1000_bucket = ((i1000_bucket.sum()["i1000.0"] / i1000_bucket.size())*100 , 2)
number_bucket = round((number_bucket.sum()["i1000.0"] / number_bucket.size())*100 , 2)
The graph appears but nothing actually plots
x = [str(i)+"-"+str(i+10) for i in range(10,91,10)]
plt.plot(x,number_bucket.values)
plt.xlabel("i1000.0")
plt.ylabel("p1000.0")
plt.title("1000.0 comparisons")

How to plot multiple lines with HvPlot Python & have multiple interactive widgets?

I am making an interactive graph in python and this is my current code:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import hvplot.pandas
import panel as pn
pn.extension('tabulator')
import holoviews as hv
hv.extension('bokeh')
df = pd.read_csv("/file/path/")
idf = df.interactive()
# Retrieving all unique underlyings
allUnderlyings = list(set(df.Underlying))
# Setting interactive widgets - user can select what they want displayed
date = pn.widgets.Select(options = ["01/31/22", "02/28/22", "03/31/22", "04/29/22"],
name = "Date")
underlying = pn.widgets.AutocompleteInput(name = "Underlying",
options = allUnderlyings)
bank = pn.widgets.Select(name = "Banks",
options = banks)
# Filter the interactive frame to the selected underlying and snap date,
# keeping only rows whose instrument type is "Spot Price" or "Forward".
# Every condition must be joined with `&`; without it the parenthesised
# instrument-type term is parsed as a call on the preceding comparison.
ipipeline = (
    idf[
        (idf.Underlying == underlying)
        & (idf["Snap Date"] == date)
        & (
            (idf["Instrument Type"] == "Spot Price")
            | (idf["Instrument Type"] == "Forward")
        )
    ]
)
ipipeline.hvplot(x = "Tenor",
y = ["Consensus", bank.value])
I am trying to plot 2 lines in my final graph - currently, the user cannot select which bank they would like to display - this has to be changed by interacting with the bank variable separately. I believe that this is because I am not using the bank variable in the ipipeline. How can I approach this without adding an additional row filter to ipipeline? The data I am trying to pull is in the same row but under a different column (that corresponds with the bank name).

How to use different colour to plot in folium map?

I have many CSV files, each of which has three columns:
latitude
longitude
distance
for example:
car1.csv
lat long total_dist
23.33 73.32. 0
23.45. 73.34. 10
23.64. 73.53. 16 ---> #cumulative sum as car1 travels
#similarly there it for car2,car3,car4 etc
So I concatenated these CSV files into one data frame to plot the map:
import pandas as pd
import folium
from pathlib import Path
import glob
path = r'C:/Users/Desktop/Sample/car_new' # use your path
all_files = glob.glob(path + "/*.csv")
li = []
for filename in all_files:
df = pd.read_csv(filename, index_col=None, header=0)
li.append(df)
car_locations = pd.concat(li, axis=0, ignore_index=True)
car_locations = car_locations[["Latitude", "Longitude",]]
map = folium.Map(location=[car_locations.Latitude.mean(), car_locations.Longitude.mean()], zoom_start=14, control_scale=True, tiles='Stamen Terrain')
folium.PolyLine(car_locations, color="red", weight=2.5, opacity=0.5).add_to(map)
map
I want to set conditions like this:
if(total_dist > 120):
#plot green line on map
else:
#plot red line
This is how it should look; I have posted a link to the image:
https://imgur.com/a/07rxQ9l
# This script will plot multiple lines
# One line per file, each line in a colour specified in the list.
from itertools import count
import pandas as pd
import folium
from pathlib import Path
import glob
#Imported OS module to set path dynamically.
# as seen below.
import os
# Dynamically set the project directory; this will allow you to
# run the script in any directory without manually
# typing the path
project_dir = os.path.realpath('') + '/'
# This is not necessary, but since this was in your script
# I left it in here.
path = project_dir
all_files = glob.glob(path + "/*.csv")
li = []
# List of colors, and a counter.
# This is an easy, less complicated way to do it, but not the only way.
colors = ['red', 'blue']
cl_count = 0
# You need to only create the map once.
# If this is in the loop, it will create a new map and overwrite
# the map when you plot the second car.
map = folium.Map(zoom_start=14, control_scale=True, tiles='Stamen Terrain')
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)
    car_locations = pd.concat(li, axis=0, ignore_index=True)
    car_locations = car_locations[["Latitude", "Longitude",]]
    # Wrap around the colour list so that having more CSV files than
    # colours re-uses colours instead of raising IndexError.
    line_color = colors[cl_count % len(colors)]
    folium.PolyLine(car_locations, color=line_color, weight=3.0, opacity=1).add_to(map)
    # You need to clear the list at the end of the loop,
    # since you want the coordinates from each file to plot a unique line.
    li = []
    cl_count += 1
map

Display summary statistics in barplot using ggplot/plotnine

In the following simplified example, I wish to display the sum of each stacked barplot (3 for A and 7 for B), yet my code displays all the values, not the summary statistics. What am I doing wrong? Thank you in advance.
import io
import pandas as pd
import plotnine as p9
data_string = """V1,V2,value
A,a,1
A,b,2
B,a,3
B,b,4"""
data = io.StringIO(data_string)
df = pd.read_csv(data, sep=",")
p9.ggplot(df, p9.aes(x='V1', y='value', fill = 'V2')) + \
p9.geom_bar(stat = 'sum') + \
p9.stat_summary(p9.aes(label ='stat(y)'), fun_y = sum, geom = "text")
The issue is the grouping of your data. As you have a global fill aesthetic your data gets grouped by categories of V2. Hence stat_summary computes the sum per group of V2. To solve this issue make fill a local aesthetic of geom_bar or geom_col.
import io
import pandas as pd
import plotnine as p9
data_string = """V1,V2,value
A,a,1
A,b,2
B,a,3
B,b,4"""
data = io.StringIO(data_string)
df = pd.read_csv(data, sep=",")
p9.ggplot(df, p9.aes(x='V1', y='value')) + \
p9.geom_col(p9.aes(fill = 'V2')) + \
p9.stat_summary(p9.aes(label ='stat(y)'), fun_y = sum, geom = "text")
Another option would be to override the global grouping by setting group=1 in stat_summary:
p9.stat_summary(p9.aes(label ='stat(y)', group = 1), fun_y = sum, geom = "text")

how to display graph with Bokeh in django for stocks

I'm using linear regression to predict the closing price of a stock on the current day. This works fine.
I'm using Django.
I need to add graphs (time-series and candlestick). After searching, I found that Bokeh is best for what I want to achieve.
Question:
I want to add time-series and candlestick graph in my Django project.
Code
This is how I m predicting stocks closing price on the current day.
stockprediction.py
def get_stock_data(name):
    """Predict a value for stock *name* with a linear-regression model.

    If ``model_check(name)`` reports no saved model, a model is fitted on
    ``<name>_data.csv`` from the historical-data folder (columns 1-3 as
    features, column 4 as target, missing values filled with the column
    means) and saved as ``<name>.pkl``. Otherwise the saved model is loaded.
    Either way the model predicts from the quote values returned by
    ``get_nse_data(name)``.

    Returns the single predicted value, or the string ``"Error"`` if
    anything goes wrong.
    """
    try:
        model_path = os.getcwd() + "\\StockPrediction\\data\\saved_data\\"
        model_file = model_path + name + ".pkl"
        if model_check(name) == False:
            # No saved model yet: train one from the historical CSV.
            data_path = os.getcwd() + "\\StockPrediction\\data\\HISTORICAL_DATA\\"
            df = pd.read_csv(data_path + name + '_data.csv')
            df.fillna(df.mean(), inplace=True)
            X = df.iloc[:, [1, 2, 3]]
            Y = df.iloc[:, [4]]
            reg = linear_model.LinearRegression()
            reg.fit(X, Y)
            y_today = reg.predict([get_nse_data(name)])
            # Save the model only after a successful prediction.
            joblib.dump(reg, model_file)
        else:
            model = joblib.load(model_file)
            y_today = model.predict([get_nse_data(name)])
        # Both branches return the scalar; the original returned the whole
        # 2-D prediction array when the model came from disk.
        return y_today[0][0]
    except Exception:
        # Keep the "Error" sentinel for callers, but let KeyboardInterrupt
        # and SystemExit propagate instead of being swallowed.
        return ("Error")
def get_nse_data(name):
    """Return ``[open, dayHigh, dayLow]`` from the quote for *name*.

    The quote is fetched with ``nse.get_quote(name)``; the three values are
    returned in that order.
    """
    quote = nse.get_quote(name)
    return [quote[field] for field in ('open', 'dayHigh', 'dayLow')]
Bonus Question:
I need the graphs that are best for showing stock prices, such as candlestick and time-series charts (can you suggest more?).
Help!
If you want to implement Bokeh, you can set up everything using the steps in the documentation (https://docs.bokeh.org/en/latest/docs/user_guide/quickstart.html#userguide-quickstart) and that will generate your .html file which you can include in your templates folder.
However, I find libraries like chart.js much more handy and customizable. They can be implemented in django fairly easily. Here's a link to a very good tutorial that helped me a lot:
https://youtu.be/B4Vmm3yZPgc
I've found holoviews to be really nice for this kind of stuff - in your case, you want to work with RangeToolLink in conjunction with hv.Curve and a pandas dataframe to make a typical stock plot with a range tool on the bottom.
Here's a simple example, stolen from the holoviews website:
import bokeh
bokeh.sampledata.download() # only needs to run once
import pandas as pd
import holoviews as hv
from bokeh.sampledata.stocks import AAPL
from holoviews.plotting.links import RangeToolLink
from holoviews import opts
hv.extension('bokeh')
# Make dataframe from stock data
aapl_df = pd.DataFrame(AAPL['close'], columns=['close'], index=pd.to_datetime(AAPL['date']))
aapl_df.index.name = 'Date'
# Create stock curve
aapl_curve = hv.Curve(aapl_df, 'Date', ('close', 'Price ($)'))
# Labels and layout
tgt = aapl_curve.relabel('AAPL close price').opts(width=800, labelled=['y'], toolbar='disable')
src = aapl_curve.opts(width=800, height=100, yaxis=None, default_tools=[])
RangeToolLink(src, tgt)
# Merge rangetool
layout = (tgt + src).cols(1)
layout.opts(opts.Layout(shared_axes=False, merge_tools=False))
Here's what you should see:
An even simpler example here uses candlesticks in bokeh:
from math import pi
import pandas as pd
from bokeh.plotting import figure, output_file, show
from bokeh.sampledata.stocks import MSFT
# Build the frame from the MSFT sample data imported above (the original
# used AAPL, which this snippet never imports and which contradicts the
# chart title). Only the first 50 rows are plotted.
df = pd.DataFrame(MSFT)[:50]
df["date"] = pd.to_datetime(df["date"])

# Masks for rows that closed above / below their opening price.
inc = df.close > df.open
dec = df.open > df.close
w = 12*60*60*1000 # half day in ms

TOOLS = "pan,wheel_zoom,box_zoom,reset,save"
# NOTE: Bokeh 3.x renamed `plot_width` to `width`; switch the keyword if you
# are on a recent Bokeh release.
p = figure(x_axis_type="datetime", tools=TOOLS, plot_width=1000, title = "MSFT Candlestick")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

# Wick: high-to-low segment; body: one bar per row spanning open to close,
# coloured differently for rising and falling rows.
p.segment(df.date, df.high, df.date, df.low, color="black")
p.vbar(df.date[inc], w, df.open[inc], df.close[inc], fill_color="#D5E1DD", line_color="black")
p.vbar(df.date[dec], w, df.open[dec], df.close[dec], fill_color="#F2583E", line_color="black")
show(p)
Result:
This all works seamlessly in a Jupyter notebook, so it should be easy enough for you - you just need to get your predictions into a Pandas dataframe!

Categories