Python .csv labeling done properly via jupyter - python

I'm not sure whether my graphs are done properly, or what I would need to change to flip them upside down. I'd also like to print them and save them to a .pdf file, but I'm not sure how to do that. Any advice would be appreciated. All the best.
Changing variables countlessly
import numpy as np
np.__version__
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from operator import itemgetter
sns.set(style="darkgrid")
# Task 1: multiply a 3x3 integer matrix by a 3x2 integer matrix.
m1 = np.array([
    [1, 2, 2],
    [-4, 3, 8],
    [-1, 0, 1],
])
m2 = np.array([
    [1, 4],
    [-2, 2],
    [3, -6],
])
# For 2-D arrays the @ operator is the same matrix product as ndarray.dot.
print(m1 @ m2)
# Task 2: build an undirected graph, draw it, and look up one node's
# betweenness centrality.
edges = [
    ('A', 'D'), ('A', 'B'),
    ('B', 'D'), ('B', 'C'), ('B', 'E'),
    ('C', 'D'), ('C', 'E'),
    ('D', 'E'),
]
G = nx.Graph()
G.add_edges_from(edges)
nx.draw(G, with_labels=True)
# The result is indexed by node label below.
array = nx.betweenness_centrality(G)
array['B']
# Task 3: load the data set and use OBJECTID as the row index.
df = pd.read_csv('xxx.csv')
df.set_index('OBJECTID', inplace=True)
df.head(1)
# Task 4: sum of PT_ENROLL per NAME, largest first.
# Stored under a descriptive name so the builtin sorted() is not shadowed.
pt_enroll_by_name = df.groupby('NAME')['PT_ENROLL'].sum().sort_values(ascending=False)
pt_enroll_by_name.head(7)
# Task 5: number of NAME entries per NAICS_DESC, largest first.
df.groupby('NAICS_DESC')['NAME'].count().sort_values(ascending=False)
# Task 6: scatter plot of TOT_ENROLL (x) against POPULATION (y).
tot_enroll = df['TOT_ENROLL']
population = df['POPULATION']
plt.scatter(tot_enroll, population)
# Label the axes so the figure can be read without the code.
plt.xlabel('TOT_ENROLL')
plt.ylabel('POPULATION')

# Task 3: load the hospitals data set and use OBJECTID as the row index.
df = pd.read_csv('Hospitals.csv')
df.set_index('OBJECTID', inplace=True)
df.head(5)
# Task 4: number of NAME entries per CITY, largest first.
# Results get their own names so the builtin sorted() is not shadowed.
hospitals_per_city = df.groupby('CITY')['NAME'].count().sort_values(ascending=False)
hospitals_per_city.head(6)
# Task 5: largest Y value per NAME, largest first.
max_y_by_name = df.groupby('NAME')['Y'].max().sort_values(ascending=False)
max_y_by_name.head(5)
# Task 6: bar chart of the BEDS total per OWNER, largest first.
beds_by_owner = df.groupby('OWNER')['BEDS'].sum().sort_values(ascending=False)
ax = beds_by_owner.plot(kind='bar')
ax.set_ylabel('BEDS')
# To flip the chart vertically call ax.invert_yaxis(); to export the figure call
# plt.savefig('some_name.pdf') - matplotlib picks the format from the extension.

Related

Programming a prediction model, code runs but doesnt give output

My code runs properly but it will not provide output as it should. I am not sure where the issue is occurring. Could someone help me correct it? Do you need the CSV too?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load the measurements and take a first look at them.
df = pd.read_csv("/content/drive/MyDrive/replicates/Replicate 3 Gilts just measures.csv")
df.info()
df.head()

# Drop the columns that are not needed for the analysis.
cols_to_drop = ["animal"]
df = df.drop(columns=cols_to_drop)
# Preview the frame again without the dropped columns.
df.head()

# Independent copy of the cleaned frame.
deep_df = df.copy(deep=True)

# Names of the columns stored as 64-bit integers or floats.
numeric_dtypes = ('int64', 'float64')
numerical_columns = [name for name in df.columns if df[name].dtype in numeric_dtypes]

# Minimum, maximum, mean and median of every numeric column.
summary_rows = ['min', 'max', 'mean', '50%']
df[numerical_columns].describe().loc[summary_rows, :]

# Row(s) holding the smallest i1000.0 value.
lowest_i1000 = df['i1000.0'].min()
df[df['i1000.0'] == lowest_i1000]
This is where the issue occurs
# Group the rows by i1000.0 into nine 10-wide intervals: (10, 20], ..., (90, 100].
bin_edges = list(range(10, 101, 10))
# observed=False keeps intervals without rows, so the result always has one
# entry per interval (NaN for empty ones) and lines up with the nine tick labels.
buckets = df.groupby(pd.cut(df["i1000.0"], bins=bin_edges), observed=False)
# Sum of i1000.0 divided by the row count in each interval, times 100, to two
# decimals. The original i1000_bucket line lacked the round( call and therefore
# produced a (Series, 2) tuple instead of a rounded Series.
bucket_ratio = ((buckets["i1000.0"].sum() / buckets.size()) * 100).round(2)
i1000_bucket = bucket_ratio
number_bucket = bucket_ratio.copy()
The graph appears but nothing actually plots
# One "low-high" tick label per 10-wide bucket: "10-20", "20-30", ..., "90-100".
x = [f"{low}-{low + 10}" for low in range(10, 91, 10)]
plt.plot(x, number_bucket.values)
plt.title("1000.0 comparisons")
plt.xlabel("i1000.0")
plt.ylabel("p1000.0")

boxplot structure disappears when pandas contains nan [duplicate]

I am using matplotlib to draw a box plot, but the data contains some missing values (NaN). For the columns that contain NaN values, no box is displayed.
Do you know how to solve this problem?
Here are the codes.
import numpy as np
import matplotlib.pyplot as plt
#==============================================================================
# open data
#==============================================================================
filename = 'C:\\Users\\liren\\OneDrive\\Data\\DATA in the first field-final\\ks.csv'
# Read the semicolon-separated file as strings. Row 0 is used as the header,
# column 0 as the treatment code, column 1 as the replicate code, and the
# remaining columns as the Ks values.
AllData = np.genfromtxt(filename, delimiter=";", skip_header=0, dtype='str')
TreatmentCode = AllData[1:, 0]
RepCode = AllData[1:, 1]
KsData = AllData[1:, 2:].astype('float')
DepthHeader = AllData[0, 2:].astype('float')
# np.unique returns the codes sorted; the index list puts them in display order.
TreatmentUnique = np.unique(TreatmentCode)[[3, 1, 4, 2, 8, 6, 9, 7, 0, 5, 10],]
nT = TreatmentUnique.size  # nT = number of treatments
# nD = number of depths; nR = number of replications; iT = treatment index
nD = 5
nR = 6
# One (depth, replication) slab per treatment.
KsData_3D = np.zeros((nT, nD, nR))
for iT, Treatment in enumerate(TreatmentUnique):
    TreatmentFilter = TreatmentCode == Treatment
    KsData_Filtered = KsData[TreatmentFilter, :]
    # Rows selected for the treatment become the last axis.
    KsData_3D[iT, :, :] = KsData_Filtered.transpose()

# Depth index to plot (this assignment was fused onto the previous statement).
iD = 4
fig = plt.figure()
ax = fig.add_subplot(111)
# One box per treatment, built from that treatment's replications at depth iD.
plt.boxplot(KsData_3D[:, iD, :].transpose())
ax.set_xticks(range(1, nT + 1))
ax.set_xticklabels(TreatmentUnique)
ax.set_title(DepthHeader[iD])
Here is the final figure and some of the treatments are missing in the box.
You can remove the NaNs from the data first, then plot the filtered data.
To do that, you can first find the NaNs using np.isnan(data), then perform the bitwise inversion of that Boolean array using the ~: bitwise inversion operator. Use that to index the data array, and you filter out the NaNs.
# Keep only the entries of `data` that are not NaN (~ inverts the boolean mask).
filtered_data = data[~np.isnan(data)]
In a complete example (adapted from here)
Tested in python 3.10, matplotlib 3.5.1, seaborn 0.11.2, numpy 1.21.5, pandas 1.4.2
For 1D data:
import matplotlib.pyplot as plt
import numpy as np
# fake up some data
np.random.seed(2022)  # so the same data is created each time
spread = np.random.rand(50) * 100
center = np.ones(25) * 50
flier_high = np.random.rand(10) * 100 + 100
flier_low = np.random.rand(10) * -100
data = np.concatenate((spread, center, flier_high, flier_low), 0)
# Add a NaN (np.nan: the np.NaN alias no longer exists in NumPy 2.0)
data[40] = np.nan
# Filter data using np.isnan: keep only the entries that are not NaN
filtered_data = data[~np.isnan(data)]
# basic plot
plt.boxplot(filtered_data)
plt.show()
For 2D data:
For 2D data, you can't simply use the mask above, since then each column of the data array would have a different length. Instead, we can create a list, with each item in the list being the filtered data for each column of the data array.
A list comprehension can do this in one line: [d[m] for d, m in zip(data.T, mask.T)]
import matplotlib.pyplot as plt
import numpy as np
# fake up some data
np.random.seed(2022)  # so the same data is created each time
spread = np.random.rand(50) * 100
center = np.ones(25) * 50
flier_high = np.random.rand(10) * 100 + 100
flier_low = np.random.rand(10) * -100
data = np.concatenate((spread, center, flier_high, flier_low), 0)
# Three columns derived from the same 1D sample
data = np.column_stack((data, data * 2., data + 20.))
# Add a NaN (np.nan: the np.NaN alias no longer exists in NumPy 2.0)
data[30, 0] = np.nan
data[20, 1] = np.nan
# Filter data using np.isnan: True where a value is present
mask = ~np.isnan(data)
# One 1D array per column; the arrays may differ in length, hence a list
filtered_data = [d[m] for d, m in zip(data.T, mask.T)]
# basic plot
plt.boxplot(filtered_data)
plt.show()
I'll leave it as an exercise to the reader to extend this to 3 or more dimensions, but you get the idea.
Use seaborn, which is a high-level API for matplotlib
seaborn.boxplot filters NaN under the hood
import seaborn as sns
# Pass the unfiltered array (NaN included) straight to seaborn
sns.boxplot(data=data)
1D
2D
NaN is also ignored if plotting from df.plot(kind='box') for pandas, which uses matplotlib as the default plotting backend.
import pandas as pd
# Wrap the unfiltered array in a DataFrame and draw its box plot
df = pd.DataFrame(data)
df.plot(kind='box')
1D
2D

Weird Time-Series Graph Using Pycaret and plotly

I am trying to visualize air quality data as time-series charts using the pycaret and plotly dash Python libraries, but I am getting very weird graphs. Below is my code:
import pandas as pd
import plotly.express as px
data = pd.read_csv('E:/Self Learning/Djang_Dash/2019-2020_5.csv')
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')
# Build the series key "Location_<OBJECTID>" and drop the raw id column.
data['time_series'] = 'Location_' + data['OBJECTID'].astype(str)
data.drop(['OBJECTID'], axis=1, inplace=True)
# extract features from date
data['month'] = data['Date'].dt.month
data['year'] = data['Date'].dt.year
data['day_of_week'] = data['Date'].dt.dayofweek
data['day_of_year'] = data['Date'].dt.dayofyear
data.head(4000)
data['time_series'].nunique()
# One figure per location: CO together with its 30-row moving average.
for i in data['time_series'].unique():
    # .copy() so the new column is added to an independent frame rather than
    # to a filtered view of `data` (avoids pandas' SettingWithCopyWarning).
    subset = data[data['time_series'] == i].copy()
    subset['moving_average'] = subset['CO'].rolling(window=30).mean()
    fig = px.line(subset, x="Date", y=["CO", "moving_average"], title=i, template='plotly_dark')
    fig.show()
I would appreciate any help with this.
here is my sample data Google Drive Link
The data was not provided in a usable form, so I looked for similar publicly available data and found: https://www.kaggle.com/rohanrao/air-quality-data-in-india?select=station_hour.csv
Using this data, and with a couple of cleanups to your code, the plots show no issues. I suspect your data has one of these issues:
date is not datetime64[ns] in your data frame
date is not sorted, leading to lines being drawn in way you have noted
by refactoring way moving average is calculated, you can use animation instead of lots of separate figures
get some data
import kaggle.cli
import sys, math
import pandas as pd
from pathlib import Path
from zipfile import ZipFile
import plotly.express as px
# download data set
# https://www.kaggle.com/rohanrao/air-quality-data-in-india?select=station_hour.csv
# Replace the process arguments with a kaggle command line before calling the
# CLI entry point (presumably it parses sys.argv - confirm against kaggle docs).
sys.argv = [sys.argv[0], "datasets", "download", "rohanrao/air-quality-data-in-india"]
kaggle.cli.main()
# List the members of the downloaded archive.
zfile = ZipFile("air-quality-data-in-india.zip")
print([member.filename for member in zfile.infolist()])
plot using code from question
import pandas as pd
import plotly.express as px
from pathlib import Path
from distutils.version import StrictVersion
# data = pd.read_csv('E:/Self Learning/Djang_Dash/2019-2020_5.csv')
# use kaggle data
# dfs = {f.filename:pd.read_csv(zfile.open(f)) for f in zfile.infolist() if f.filename in ['station_day.csv',"stations.csv"]}
# data = pd.merge(dfs['station_day.csv'],dfs["stations.csv"], on="StationId")
# data['Date'] = pd.to_datetime(data['Date'])
# # kaggle data is different from question, make it compatible with questions data
# data = data.assign(OBJECTID=lambda d: d["StationId"])
# sample data from google drive link
data2 = pd.read_csv(Path.home() / "Downloads" / "AQI.csv")
data2["Date"] = pd.to_datetime(data2["Date"])
data = data2
# as per very first comment - it's important data is ordered !
data = data.sort_values(["Date", "OBJECTID"])
data['time_series'] = "Location_" + data["OBJECTID"].astype(str)
# clean up data, remove rows where there is no CO value
data = data.dropna(subset=["CO"])
# can do moving average in one step (can also be used by animation).
# transform() hands back values aligned on the original row index, so the
# assignment does not depend on row order or on the installed pandas version
# (no distutils/StrictVersion check needed).
data["moving_average"] = data.groupby("time_series")["CO"].transform(
    lambda co: co.rolling(window=30).mean()
)
# just the first three for purpose of demonstration
for i in data['time_series'].unique()[0:3]:
    subset = data.loc[data['time_series'] == i]
    fig = px.line(subset, x="Date", y=["CO", "moving_average"], title=i, template='plotly_dark')
    fig.show()
can use animation
# y-axis limits: from the smallest CO value up to the 97th percentile of CO.
co_axis_range = [data["CO"].min(), data["CO"].quantile(.97)]
# A single figure with one animation frame per time_series value.
px.line(
    data,
    x="Date",
    y=["CO", "moving_average"],
    animation_frame="time_series",
    template="plotly_dark",
).update_layout(yaxis={"range": co_axis_range})

Plotting Results from For Iteration

I am new to python and I want to ask how to plot a figure from for loop iteration?
Here is the code!
import numpy as np #numerical python
import matplotlib.pyplot as plt #python plotting
from math import exp #exponential math directory
# Circuit constants used by the formulas below.
T_initial = 293
T_reference = range(298, 340, 2)
R1_initial = 57500
R2_initial = 13300
R3_initial = 18000
R4_initial = 5600
Beta = 4150
Vin = 2.8
# Print the rounded output voltage for every reference temperature.
for i in T_reference:
    # R1 scaled exponentially with the difference of the inverse temperatures.
    R1_refe = R1_initial*exp(Beta*((1/i)-(1/T_initial)))
    # Difference between the two resistor-divider ratios.
    Rs = (R2_initial/(R2_initial + R1_refe)) - (R4_initial/(R3_initial + R4_initial))
    Vo = Vin*Rs
    Vo_round = round(Vo, 3)
    print(i, Vo_round)
You can plot the data like this:
# Add one scatter point per reference temperature, then show the figure once.
for i in T_reference:
    R1_refe = R1_initial*exp(Beta*((1/i)-(1/T_initial)))
    Rs = (R2_initial/(R2_initial + R1_refe)) - (R4_initial/(R3_initial + R4_initial))
    Vo = Vin*Rs
    Vo_round = round(Vo, 3)
    plt.scatter(i, Vo_round)
plt.show()
Is this what you were looking for?
Put the values of the items you want to plot into two different arrays using the 'append' method (one for the 'x' axis and one for the 'y' axis).
Then just plot the graph with the matplotlib
It should be something like the below:
# Collect the x values (temperatures) and y values (rounded voltages) first,
# then draw them as a single line.
is1 = []
vos = []
for i in T_reference:
    R1_refe = R1_initial*exp(Beta*((1/i)-(1/T_initial)))
    Rs = (R2_initial/(R2_initial + R1_refe)) - (R4_initial/(R3_initial + R4_initial))
    Vo = Vin*Rs
    Vo_round = round(Vo, 3)
    print(i, Vo_round)
    is1.append(i)
    vos.append(Vo_round)
plt.plot(is1, vos)
# Display the figure, as the other snippets do.
plt.show()
Here is a reference for plotting
Two options without a for-loop
Create a function
def v_o(T_reference):
    """Return the output voltage for one reference temperature, rounded to 3 decimals.

    Args:
        T_reference: A single non-zero temperature value, in the same unit as
            the built-in T_initial of 293 (presumably kelvin - confirm).

    Returns:
        Vin multiplied by the difference of the two resistor-divider ratios,
        rounded to three decimal places.
    """
    T_initial = 293
    R1_initial = 57500
    R2_initial = 13300
    R3_initial = 18000
    R4_initial = 5600
    Beta = 4150
    Vin = 2.8
    # R1 scaled exponentially with the difference of the inverse temperatures.
    R1_refe = R1_initial*exp(Beta*((1/T_reference)-(1/T_initial)))
    # Difference between the R1/R2 divider ratio and the R3/R4 divider ratio.
    Rs = (R2_initial/(R2_initial + R1_refe)) - (R4_initial/(R3_initial + R4_initial))
    Vo = Vin*Rs
    return round(Vo, 3)
Option 1: Use a pandas dataframe
import pandas as pd
import matplotlib.pyplot as plt
# One row per reference temperature.
t_ref_values = list(range(298, 340, 2))
df = pd.DataFrame({'t_ref': t_ref_values})
# Apply v_o to every temperature and store the result next to it.
df['v_o'] = df['t_ref'].apply(v_o)
# Line plot of v_o against t_ref, without a legend.
df.plot(x='t_ref', y='v_o', legend=False)
plt.show()
Option 2: use map
T_reference = list(range(298, 340, 2))
# Store the results under their own name: assigning them to `v_o` would
# replace the function with a list, so it could not be called again.
v_o_values = list(map(v_o, T_reference))
plt.plot(T_reference, v_o_values)
plt.show()
Plot
The plot from both options looks like the following

How to make many plots with multiply groupby pandas?

Sorry, I can't google how to get my aim so I am here.
see some sandbox datatable:
mode X Y
0 1 3 10
1 1 4 11
2 1 3 12
3 1 4 13
4 2 3 14
5 2 4 15
6 2 3 16
7 2 4 17
I created the following sandbox code. I want a plot with TWO lines, one for each mode ('mode 1' and 'mode 2'). The X-axis should show 3 and 4. For mode 1 the line should connect the averaged Y values, (3,(10+12)/2)--(4,(11+13)/2), and analogously (3,15)--(4,16) for mode 2.
But this code even doesn't work.
#!/usr/bin/python3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Build the 8-row sandbox table: three row lists transposed into the columns mode, X, Y.
df = pd.DataFrame([[1,1,1,1,2,2,2,2],[3,4,3,4,3,4,3,4],list(range(10,18))]).T
df.columns = ['mode','X','Y']
# Group means; Oy is grouped by the (mode, X) pair.
mode = df.groupby(['mode'])['mode'].mean()
Ox = df.groupby(['X'])['X'].mean()
Oy = df.groupby(['mode','X'])['Y'].mean()
for x in mode:
# NOTE(review): 'mode'== x compares a string with a number, so it is always False
# and Oy is indexed with that bare boolean instead of selecting one mode.
# 'test' + x also fails, because x is a number and not a string.
plt.plot(Ox, Oy[Oy['mode'== x]] , label = 'test' + x)
plt.savefig('testpandas.pdf')
You might want to try the seaborn package, which has a lot of functionality for stuff like this
import seaborn as sns
# One fit per mode (hue); x_estimator=np.mean asks seaborn to plot the mean of Y
# at each distinct X value instead of the individual points.
sns.lmplot(data=df,hue='mode',x='X',y='Y',x_estimator=np.mean)
Here's one way to do it in plain pandas:
# Mean of Y for every (mode, X) pair, with mode and X kept as ordinary columns.
y_means = df.groupby(['mode', 'X'], as_index=False).mean()
# One line with markers per mode.
for mode, g in y_means.groupby('mode'):
    plt.plot(g['X'], g['Y'], 'o-', label='mode = ' + str(mode))
# Show the labels set above; without a legend they are never displayed.
plt.legend()
This is an answer from the person who asked the question.
Actually, I've found a solution myself.
#!/usr/bin/python3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Build the 8-row sandbox table: three row lists transposed into the columns mode, X, Y.
df = pd.DataFrame([[1, 1, 1, 1, 2, 2, 2, 2], [3, 4, 3, 4, 3, 4, 3, 4], list(range(10, 18))]).T
df.columns = ['mode', 'X', 'Y']
# Group means; Oy is grouped by the (mode, X) pair.
mode = df.groupby(['mode'])['mode'].mean()
Ox = df.groupby(['X'])['X'].mean()
Oy = df.groupby(['mode', 'X'])['Y'].mean()
# One line per mode: select that mode's part of Oy and plot it against Ox.
for x in mode:
    plt.plot(Ox, Oy[mode[x]], label='test' + str(x))
# Save once, after every line has been drawn.
plt.savefig('testpandas.png')
I would guess the easiest way to do this is to use a pivot_table. This reduces the whole thing to two lines:
# Rows are X values, columns are modes; pivot_table aggregates with the mean by default.
piv = pd.pivot_table(df, columns="mode", index="X")
plt.plot(piv)
or even only one, if you use pandas integrated plotting functionality:
# Same pivot (rows X, columns mode), plotted directly through pandas.
pd.pivot_table(df, columns="mode", index="X").plot()
The complete solution using matplotlib:
import pandas as pd
import matplotlib.pyplot as plt
# Build the 8-row sandbox table: three row lists transposed into the columns mode, X, Y.
df = pd.DataFrame([[1, 1, 1, 1, 2, 2, 2, 2], [3, 4, 3, 4, 3, 4, 3, 4], list(range(10, 18))]).T
df.columns = ['mode', 'X', 'Y']
# Rows are X values, columns are (value, mode) pairs holding the mean of Y.
piv = pd.pivot_table(df, columns="mode", index="X")
# print() as a function: the statement form `print piv` is a syntax error in Python 3.
print(piv)
plt.plot(piv)
# c[1] is the mode part of each (value, mode) column key.
plt.legend(labels=["mode {}".format(c[1]) for c in piv.columns.values])
plt.show()
which prints the pivot table as
Y
mode 1 2
X
3 11 15
4 12 16
and creates the plot

Categories