How to specify and locate cells using Pandas and use fillna - python

I am using a dataset that can be found on Kaggle website (https://www.kaggle.com/claytonmiller/lbnl-automated-fault-detection-for-buildings-data).
I am trying to write code that uses the Timestamp column to select specific rows (in the context of this dataset, the rows with times between 10:01 PM and 6:59 AM) and fills all the columns of those rows with zero.
I have tried the following code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
%matplotlib inline
df = pd.read_csv('RTU.csv')
def fill_na(row):
    """Fill missing values with 0 for rows recorded during the night.

    Intended for use with ``df.apply(fill_na, axis=1)``.

    Args:
        row: A row (pandas Series) with a 'Timestamp' entry that
            ``pd.to_datetime`` can parse.

    Returns:
        A copy of the row with NaN replaced by 0 when the time of day is
        between 22:01 and 06:59 (inclusive); otherwise the row unchanged.
    """
    time_of_day = pd.to_datetime(row['Timestamp']).time()
    # The window wraps around midnight, so a chained comparison
    # (22:01 <= t <= 06:59) can never be true; the two halves must be OR-ed.
    if time_of_day >= dt.time(22, 1) or time_of_day <= dt.time(6, 59):
        # fillna returns a new object, so its result has to be returned.
        return row.fillna(0)
    # Always return the row: apply() builds the result from return values.
    return row
### df = df.apply(fill_na, axis=1) ###
df= df.apply(lambda row : fill_na(row), axis=1)
#### df.fillna(0, inplace=True) ###
df.head(2000)
However, after changing the axis of the dataset, it no longer seems to work as intended.

I don't think you need a function to do that. Just filter the rows using a condition and then fillna.
import datetime as dt
import pandas as pd
df = pd.read_csv('RTU.csv',parse_dates=['Timestamp'])
df.head()
# Boolean mask: True for rows whose time of day is strictly after 22:00
# or strictly before 07:00 (the two halves are OR-ed because the window
# wraps around midnight).
cond = (df.Timestamp.dt.time > dt.time(22,0)) | ((df.Timestamp.dt.time < dt.time(7,0)))
# Replace NaN with 0 in the masked rows only and write them back;
# rows outside the mask are left untouched.
df[cond] = df[cond].fillna(0,axis=1)
The output shows that the NaN values before 7 AM are filled with 0.

Related

Pandas Dataframe replace outliers

Thank you in advance for your help! (Code Provided Below) (Data Here)
I would like to remove the outliers outside of 5/6th standard deviation for columns 5 cm through 225 cm and replace them with the average value for that date (Month/Day) and depth. What is the best way to do that?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
raw_data = pd.read_csv('all-deep-soil-temperatures.csv', index_col=1, parse_dates=True)
df_all_stations = raw_data.copy()
df_selected_station.fillna(method = 'ffill', inplace=True);
df_selected_station_D=df_selected_station.resample(rule='D').mean()
df_selected_station_D['Day'] = df_selected_station_D.index.dayofyear
mean=df_selected_station_D.groupby(by='Day').mean()
mean['Day']=mean.index
mean.head()
For a more general solution, assume that you are given a dataframe df with some column a:
from scipy import stats

# Flag the values of column 'a' whose z-score magnitude exceeds 5.
outliers = np.abs(stats.zscore(df['a'])) > 5
# Use a single .loc assignment: chained indexing (df[mask]['a'] = ...)
# writes into a temporary copy and leaves df unchanged.
df.loc[outliers, 'a'] = df['a'].mean()

Selecting between timestamps pandas

Hello, I cannot understand why this code does not select the rows between the two dates. It shows me the same dataset, starting from the first date in 2004. Here is my code:
import pandas as pd
from pandas import DataFrame
import datetime
from matplotlib import pyplot as plt
df1 = pd.read_csv('time_series_15min_singleindex.csv',header=0,index_col=0,parse_dates=True)
df=DataFrame(df1,columns['utc_timestamp','DE_solar_generation_actual','DE_wind_onshore_generation_actual']
df['utc_timestamp'] = pd.to_datetime(df['utc_timestamp'],utc=True)
start_date=pd.to_datetime('2008-12-31',utc=True)
end_date=pd.to_datetime('2009-01-01',utc=True)
df[df['utc_timestamp'].between(start_date,end_date)]
df.plot()
You forgot to assign the result back; use:
df = df[df['utc_timestamp'].between(start_date,end_date)]

Getting the Date attribute correct for subgrouping and regression lmplot

Given the following data from a CSV file, I want to plot a regression plot using Matlab for the mean of the 2-bedroom price.
I have managed to use subgroup to get the mean. However, after reading the solution from Stackoverflow and trying it, I mostly end up with other never-ending data-related problems. In general most of the errors are either to convert it to string or it is not index etc.
Bedrooms Price Date
0 2.0 NaN 3/9/2016
1 1480000.0 3/12/2016
2 2.0 1035000.0 4/2/2016
3 3.0 NaN 4/2/2016
4 3.0 1465000.0 4/2/2016
#Assume you have the following dataframe df that describes flights
%matplotlib inline
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('justtesting.csv', nrows=50, usecols=['Price','Date','Bedrooms'])
df = df.dropna(0)e
df['Date'] = pd.to_datetime(df.Date)
df.sort_values("Date", axis = 0, ascending = True, inplace = True)
df2 = df[df['Bedrooms'] == 2].groupby(["Date"]).agg(['sum'])
df2.head()
df2.info()
sns.set()
g=sns.lmplot(x="Date", y="Price", data=df2, lowess=True)
#Assume you have the following dataframe df that describes flights
%matplotlib inline
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = x.copy()
df = df.dropna(0)
df.sort_values("Date", axis = 0, ascending = True, inplace = True)
df2 = df[df['Bedrooms'] == 2].groupby(["Date", 'Bedrooms'], as_index=False).sum()
df2.head()
df2.info()
sns.set()
g=sns.lmplot(x='Date', y="Price", data=df2, lowess=True)
By default, groupby makes the grouped-by columns the index; passing as_index=False fixes that. However, seaborn's lmplot requires float values. More info can be found in this question

matplotlib dataframe x axis date issue

import sys
import ConfigParser
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime as DT
import bokeh
sys.path.extend(['..\..\myProj\SOURCE'])
fullfilepath = "../../myProj/SOURCE/" + 'myparts.txt'
ohg_df = pd.read_csv(fullfilepath, sep="\t" )
temp_df = temp_df[['as_on_date', 'ohg_qty']]
temp_df = temp_df.sort(['as_on_date'], ascending=[1])
temp_df.set_index('as_on_date')
plt.plot(temp_df.index, temp_df.ohg_qty)
plt.show()
This is my dataframe after importing.
I am trying to plot the line graph with x axis as date mentioned in the dataframe.
Can someone guide me... I am new to pandas.
dataframe picture
output picture
Easier:
# Set the index directly while reading (the read_csv keyword is index_col;
# there is no `index` keyword)
ohg_df = pd.read_csv(fullfilepath, sep="\t", index_col='as_on_date')
# Convert string index to dates
ohg_df.index = pd.to_datetime(ohg_df.index)
# Get a column and plot it (taking a column keeps the index)
plt.plot(ohg_df.ohg_qty)

Pandas index column for a dataframe

Hi all so I'm trying to work with this set of data that has two columns, one is names and the other is the number of births for each name. What I want to do is import a csv file, perform some basic functions on it such as finding the baby name with the maximum number of births, and then plotting the data in a bar graph. But, when I have an index value for the dataframe, the bar graph prints that as the x axis instead of the names. So I removed the index and now I get all kinds of errors. Below is my code, first the one with the index and then the one without. Thanks in advance. This is really driving me crazy
import pandas as pd
import matplotlib.pyplot as plt
import pdb
import matplotlib as p
import os
from pandas import DataFrame
Location = os.path.join(os.path.sep,'Users', 'Mark\'s Computer','Desktop','projects','data','births1880.csv')
a = pd.read_csv(Location, index_col = False)
print(a) #print the dataframe just to see what I'm getting.
MaxValue = a['Births'].max()
MaxName = a['Names'][a['Births'] == MaxValue].values
print(MaxValue, ' ', MaxName)
a.plot(kind ='bar')
plt.show()
This code works but spits out a bar graph with the index as the x axis instead of the names?
import pandas as pd
import matplotlib.pyplot as plt
import pdb
import matplotlib as p
import os
from pandas import DataFrame
Location = os.path.join(os.path.sep,'Users', 'Mark\'s Computer','Desktop','projects','data','births1880.csv')
a = pd.read_csv(Location, index_col = True) #why is setting the index column to true removing it?
print(a) #print the dataframe just to see what I'm getting.
MaxValue = a['Births'].max()
MaxName = a['Names'][a['Births'] == MaxValue].values
print(MaxValue, ' ', MaxName)
a.plot(kind ='bar', x='Names', y = 'Births' )
plt.show()
edited for solution.
It would have been nice if you had provided a sample CSV file; since you didn't, I made one up. It took me a while to figure out what format pandas expects.
I used a test.csv that looked like:
names,births
mike,3
mark,4
Then my python code:
import pandas
import numpy
import matplotlib.pyplot as plt
a = pandas.read_csv('test.csv', index_col = False)
a.plot(kind='bar')
indices = numpy.arange(len(a['names']))
plt.xticks( indices+0.5, a['names'].values)
plt.show()

Categories