Split data frame in python based on one parameter shape - python

I have a data frame which is like the following :
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
df_input = pd.read_csv('combine_input.csv', delimiter=',')
df_output = pd.read_csv('combine_output.csv', delimiter=',')
In this data frame many rows are repeated: for example, the first row is repeated more than 1000 times, and the same goes for the other rows.
When I plot the distribution of the time parameter, I get the following figure, which shows the frequency of each time value:
df_input.plot(y='time',kind = 'hist',figsize=(10,10))
plt.grid()
plt.show()
My question is: how can I take only the data inside the red rectangle shown below, for example at time = 0.006 and frequency = 0.75 × 10^6 (see the following picture)?

Note: the code below uses a column named target. Your column is named time, so either write time in place of target or rename your column to target.
def calRows(df, x, y):
    """Return how many rows to keep for each distinct ``target`` value.

    Only rows with ``target <= x`` are considered. The result is the size of
    the smallest group of identical ``target`` values among those rows,
    capped at ``y``.

    Args:
        df: DataFrame that has a ``target`` column.
        x: Inclusive upper bound on ``target``.
        y: Maximum number of rows to keep per value.

    Returns:
        The per-value row count as an ``int``. When no row satisfies
        ``target <= x`` the smallest group size is taken to be 0.
    """
    selected = df.target[df.target <= x]
    # A single vectorised pass gives the size of every group; this avoids
    # re-filtering the frame once per distinct value and avoids calling
    # int() on a one-element Series.
    counts = selected.value_counts()
    minCount = int(counts.min()) if len(counts) else 0
    if minCount > y:
        minCount = int(y)
    return minCount
You have to pass your data frame, the x-axis cutoff of the graph and the y-axis cutoff of the graph to the calRows(df,x,y) function, which returns the number of rows to take for each target.
# Names are case-sensitive: the function is defined as calRows, not CalRows.
rows = calRows(df, 6, 75)
print(rows)
The takeFeatures(df,rows,x) function takes the dataframe, rows (the result of the first function) and the x-axis cutoff of the graph, and returns the final dataframe.
def takeFeatures(df, rows, x):
    """Draw ``rows`` random rows for every distinct ``target`` value up to ``x``.

    Args:
        df: DataFrame that has a ``target`` column.
        rows: Number of rows to sample (without replacement) per value.
        x: Inclusive upper bound on ``target``.

    Returns:
        DataFrame holding the sampled rows, grouped by ``target`` value in
        order of first appearance. When no row satisfies ``target <= x`` an
        empty frame with the columns of ``df`` is returned.

    Raises:
        ValueError: Propagated from ``DataFrame.sample`` when some value has
            fewer than ``rows`` rows.
    """
    df1 = df[df.target <= x]
    samples = [
        df1[df1.target == value].sample(rows)
        for value in df1.target.unique()
    ]
    if not samples:
        # pd.concat rejects an empty list; the filtered frame is already
        # the empty result with the right columns and dtypes.
        return df1.copy()
    # Concatenate once: growing a frame inside the loop copies it on every
    # iteration, and seeding it with an empty frame degrades the dtypes.
    return pd.concat(samples)
Calling the takeFeatures() function:
final = takeFeatures(df,rows,6)
print(final)
Your final DataFrame will contain the values that you expected from the graph.
After plotting this final dataframe, you will get a graph like this:

Related

Programming a prediction model, code runs but doesnt give output

My code runs properly but it will not provide output as it should. I am not sure where the issue is occurring. Could someone help me correct it? Do you need the CSV too?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
df = pd.read_csv("/content/drive/MyDrive/replicates/Replicate 3 Gilts just measures.csv")
df.info()
df.head()
# removing the irrelevant columns
cols_to_drop = ["animal"]
df = df.drop(columns=cols_to_drop,axis=1)
# first five rows of data frame after removing columns
df.head()
deep_df = df.copy(deep = True)
numerical_columns = [col for col in df.columns if (df[col].dtype=='int64' or
df[col].dtype=='float64')]
df[numerical_columns].describe().loc[['min','max', 'mean','50%'],:]
df[df['i1000.0'] == df['i1000.0'].min()]
This is where the issue occurs
# Bin the rows by "i1000.0"; both groupings use the same bin edges.
bins = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
i1000_bucket = df.groupby(pd.cut(df["i1000.0"], bins=bins))
number_bucket = df.groupby(pd.cut(df["i1000.0"], bins=bins))
# round() was missing on the first line, which silently built a
# (Series, 2) tuple instead of a rounded Series.
i1000_bucket = round((i1000_bucket.sum()["i1000.0"] / i1000_bucket.size()) * 100, 2)
number_bucket = round((number_bucket.sum()["i1000.0"] / number_bucket.size()) * 100, 2)
The graph appears but nothing actually plots
x = [str(i)+"-"+str(i+10) for i in range(10,91,10)]
plt.plot(x,number_bucket.values)
plt.xlabel("i1000.0")
plt.ylabel("p1000.0")
plt.title("1000.0 comparisons")

Generating and Storing Samples of an Exponential Distribution with a name for each sample using a loop

I've got a weird question for a class project. Assuming X ~ Exp(Lambda), Lambda=1.6, I have to generate 100 samples of X, with the indices corresponding to the sample size of each generated sample (S1, S2 ... S100). I've worked out a simple loop which generates the required samples as arrays, but I am not able to rename the arrays.
First attempt:
import numpy as np
import matplotlib.pyplot as plt
samples = []
for i in range(1,101,1):
samples.append(np.random.exponential(scale= 1/1.6, size= i))
Second attempt:
import numpy as np
import matplotlib.pyplot as plt
for i in range(1,101,1):
samples = np.random.exponential(scale= 1/1.2, size= i)
col = f'samples {i}'
df_samples[col] = exponential_sample
df_samples = pd.DataFrame(samples)
An example how I would like to visualize the data:
# drawing 50 random samples of size 2 from the exponentially distributed population
sample_size = 2
df2 = pd.DataFrame(index= ['x1', 'x2'] )
for i in range(1, 51):
exponential_sample = np.random.exponential((1/rate), sample_size)
col = f'sample {i}'
df2[col] = exponential_sample
# Taking a peek at the samples
df2
But instead of having a simple size = 2, I would like to have sample size = i. This way, I will be able to generate 1 rows for the first column (S1), 2 rows for the second column (S2), until I reach 100 rows for the 100th column (S100).
You cannot easily put vectors of different lengths into a DataFrame, so your mock-up code would not work, but you can build one column per sample and concatenate them along axis=1 (shorter columns are padded with NaN):
# One column per sample size: S1 holds 1 draw, S2 holds 2, ..., S100 holds 100.
rate = 1.6  # Lambda of the exponential distribution; numpy expects scale = 1 / Lambda
columns = [
    pd.Series(np.random.exponential(scale=1 / rate, size=i), name=f'S{i}')
    for i in range(1, 101)
]
# Concatenate once; growing a frame inside the loop copies it on every
# iteration. Columns shorter than 100 rows are padded with NaN.
df = pd.concat(columns, axis=1)
Use a dict instead maybe?
# Keyed by sample size: samples[i] is an array of i draws from Exp(Lambda=1.6).
samples = {
    i: np.random.exponential(scale=1 / 1.6, size=i)
    for i in range(1, 101)
}
Then you can convert it into a pandas Dataframe if you like.

Display summary statistics in barplot using ggplot/plotnine

In the following simplified example, I wish to display the sum of each stacked barplot (3 for A and 7 for B), yet my code displays all the values, not the summary statistics. What am I doing wrong? Thank you in advance.
import io
import pandas as pd
import plotnine as p9
data_string = """V1,V2,value
A,a,1
A,b,2
B,a,3
B,b,4"""
data = io.StringIO(data_string)
df = pd.read_csv(data, sep=",")
p9.ggplot(df, p9.aes(x='V1', y='value', fill = 'V2')) + \
p9.geom_bar(stat = 'sum') + \
p9.stat_summary(p9.aes(label ='stat(y)'), fun_y = sum, geom = "text")
The issue is the grouping of your data. As you have a global fill aesthetic your data gets grouped by categories of V2. Hence stat_summary computes the sum per group of V2. To solve this issue make fill a local aesthetic of geom_bar or geom_col.
import io
import pandas as pd
import plotnine as p9
data_string = """V1,V2,value
A,a,1
A,b,2
B,a,3
B,b,4"""
data = io.StringIO(data_string)
df = pd.read_csv(data, sep=",")
p9.ggplot(df, p9.aes(x='V1', y='value')) + \
p9.geom_col(p9.aes(fill = 'V2')) + \
p9.stat_summary(p9.aes(label ='stat(y)'), fun_y = sum, geom = "text")
Another option would be to override the global grouping by setting group=1 in stat_summary:
p9.stat_summary(p9.aes(label ='stat(y)', group = 1), fun_y = sum, geom = "text")

Plotly: How to plot markers with time values on a different trace?

I have 2 data frames:
df1 contains columns: “time”, “bid_price”
df2 contains columns: “time”, “flag”
I want to plot a time series of df1 as a line graph and i want to put markers on that trace at points where df2 “flag” column value = True at those points in time
How can i do this?
You can do so in three steps:
set up a figure using go.Figure(),
add a trace for your bid_prices using fig.add_traces(go.Scatter(...)),
and do the same thing for your flags.
The snippet below does exactly what you're describing in your question. I've set up two dataframes df1 and df2, and then I've merged them together to make things a bit easier to reference later on.
I'm also showing flags for an accumulated series where each increment in the series > 0.9 is flagged in flags = [True if elem > 0.9 else False for elem in bid_price] . You should be able to easily adjust this to whatever your real world dataset looks like.
Plot:
Complete code with random data:
# imports
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import random
# settings
observations = 100
np.random.seed(5)  # fixed seed so the example is reproducible
bid_price = np.random.uniform(low=-1, high=1, size=observations).tolist()
# flag every raw increment above 0.9
flags = [elem > 0.9 for elem in bid_price]
# daily timestamps as 'YYYY-MM-DD' strings (DatetimeIndex.format() is deprecated)
time = pd.date_range('2020', freq='D', periods=observations).strftime('%Y-%m-%d').tolist()

# bid price
df1 = pd.DataFrame({'time': time, 'bid_price': bid_price})
df1.set_index('time', inplace=True)
df1.iloc[0] = 0  # start the accumulated series at zero

# flags
df2 = pd.DataFrame({'time': time, 'flags': flags})
df2.set_index('time', inplace=True)

# join on the shared time index, then accumulate the increments
df = df1.merge(df2, left_index=True, right_index=True)
df.bid_price = df.bid_price.cumsum()
# NaN where not flagged, so the marker trace only draws the flagged points
df['flagged'] = np.where(df['flags'], df['bid_price'], np.nan)
# plotly setup
fig = go.Figure()
# trace for bid_prices
fig.add_traces(go.Scatter(x=df.index, y=df['bid_price'], mode = 'lines',
name='bid_price'))
# trace for flags
fig.add_traces(go.Scatter(x=df.index, y=df['flagged'], mode = 'markers',
marker =dict(symbol='triangle-down', size = 16),
name='Flag'))
fig.update_layout(template = 'plotly_dark')
fig.show()

Python Index column doesn't freeze while scrolling to the right

My problem is that I have a DataFrame of 200 rows and 200 columns. While I scroll to the right, the index column stays fixed (I can still see it), as it should.
However, when I select a column or a value in the DataFrame (for example, to sort the values in ascending or descending order), the index column changes and becomes the same as the column I selected.
I would like to still see the index column.
I am using Spyder 3.3.0 and Python 3.6
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import operator
# Importing the dataset
dataset = pd.read_csv('1992_2014.csv', index_col=0)
nations_all = dataset.iloc[:, 0].values
# Distinct nations in order of first appearance.
nations = list(pd.unique(nations_all))
Year = dataset.iloc[:, 1].values
CO2 = dataset.iloc[:, 8].values

# Creating the Trend Matrix between two nations
# Each nation's CO2 series divided by its own first value. Computed once per
# nation here instead of once per pair inside the nested loop.
relative_co2 = {}
for nation in nations:
    # The index labels of the nation's rows are used as positions into CO2.
    positions = dataset[dataset["Nation"] == nation].index.values.astype(int)
    relative_co2[nation] = CO2[positions] / CO2[positions[0]]

trend_matrix = pd.DataFrame(index=nations, columns=nations)
for i in nations:
    for k in nations:
        # Sum of absolute differences between the two normalised series;
        # zip stops at the shorter series when the lengths differ.
        trend_matrix.loc[i, k] = sum(
            abs(a - b) for a, b in zip(relative_co2[i], relative_co2[k])
        )
Thanks!

Categories