Convert hours format to minutes (float) with Pandas - python

I'm following a web-scraping tutorial and I'm stuck on one part.
I only get errors when I try to run the following code:
df7['Time2'] = df7['Time'].str.split(':').apply(lambda x: float(x[0]) * 60 + float(x[1]) + float(x[2])/60)
I get the error:
IndexError: list index out of range
Also tried the following:
time_mins = []
for i in time_list:
    h, m, s = i.split(':')
    math = (int(h) * 3600 + int(m) * 60 + int(s)) / 60
    time_mins.append(math)
Again, it didn't work.
My cell looks like this: [screenshot omitted]
The result I want looks like this: [screenshot omitted]
Any help would be appreciated...
Thanks in advance.

Create Sample Dataframe:
# Import packages
import pandas as pd
# Create sample dataframe
time = ['1:38:17','1:38:31','1:38:32']
gender = ['M','F','M']
data = pd.DataFrame({
    'Time': time,
    'Gender': gender
})
data
Out[]:
      Time Gender
0  1:38:17      M
1  1:38:31      F
2  1:38:32      M
Convert column into timedelta format:
# Time conversion
data['Time'] = pd.to_timedelta(data['Time'])
# Time in days
data = data.assign(Time_in_days = [x.days for x in data['Time']])
# Time in hour
data = data.assign(Time_in_hour = [(x.seconds)/(60.0*60.0) for x in data['Time']] )
# Time in minutes
data = data.assign(Time_in_minutes = [(x.seconds)/60.0 for x in data['Time']])
# Time in seconds
data = data.assign(Time_in_seconds = [x.seconds * 1.0 for x in data['Time']] )
print(data)
       Time Gender  Time_in_days  Time_in_hour  Time_in_minutes  Time_in_seconds
0  01:38:17      M             0      1.638056        98.283333           5897.0
1  01:38:31      F             0      1.641944        98.516667           5911.0
2  01:38:32      M             0      1.642222        98.533333           5912.0
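If only the minutes column is needed, there is a more compact vectorized variant (a sketch; note that .dt.total_seconds() also folds the days component in, which the .seconds attribute above does not):
# after the pd.to_timedelta conversion above
data['Time_in_minutes'] = data['Time'].dt.total_seconds() / 60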

data['Time2'] = data['Time'].apply(
    lambda x: sum([a*b for a, b in zip(list(map(int, x.split(':')))[::-1], [1/60, 1, 60])])
)
This assumes data['Time'] is stored as strings; for example, '1:38:17' maps to 17*(1/60) + 38*1 + 1*60 ≈ 98.283 minutes. If the column does not have string dtype, cast it first, e.g. with data['Time'].astype(str), before splitting.
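Neither snippet above is wrong for well-formed H:MM:SS strings, so the original IndexError most likely means some rows do not split into three fields (empty strings, NaN, or M:SS-only values). A quick diagnostic on the asker's frame (a sketch):
df7['Time'].str.split(':').str.len().value_counts(dropna=False)
Any count other than 3 marks the rows that break the lambda.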

Related

How to display the average price of bitcoin in Pandas and not through a loop?

y = []
n = 0
days = 1
for i in btc['Adj Close']:
    averagePrice = (i + n) / days
    n += i
    days += 1
    y.append(averagePrice)
btc['TopAverage'] = y
If btc is a pandas data frame (as it appears so), then:
btc.loc[:, 'Days'] = list(range(1, btc.shape[0] + 1))
btc.loc[:, 'n'] = btc['Adj Close'].cumsum().shift(periods=1, fill_value=0.)
btc.loc[:, 'TopAverage'] = (btc['Adj Close'] + btc['n']) / btc['Days']
reflects the logic in your code. This will add the columns 'Days' and 'n' to the data frame as well.
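If I've read the loop correctly, TopAverage is just the running mean of the first k prices, so pandas can also produce it in one line (equivalent result, without the helper columns):
btc['TopAverage'] = btc['Adj Close'].expanding().mean()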

Python KeyError: 101 when I try to calculate multiple forecasts for a time series

I want to use a time series in Python and calculate n forecasts.
I tried to use a for loop, but when I use n >= 2 I get an error: "KeyError: 101"
I tried:
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta

dateparse = lambda x: datetime.strptime(x, '%YM%m')
df = pd.read_excel('test.csv', sheet_name=f'sheet_1', index_col=2, parse_dates=['date'], date_parser=dateparse)
ad = df['ad']
n = 2
k = 3
for x in range(n):
    tot = len(ad) - 1
    adtf = 7 + 23*ad[tot-1] + 55*ad[tot-2] + 13*nu[tot-1] + 3*nu[tot-2]
    indexf = ad.index[tot]
    indexf += relativedelta(months=+1)
    i = pd.Index([indexf])
    ad = ad.append(pd.DataFrame({0: [adtf]}, index=i))
    nu = nu.append(pd.DataFrame({0: [k]}, index=i))
print(ad)
PS: I have added nu = nu.append(pd.DataFrame({0:[k]}, index=i)) in order to have a value to use in the next iteration.
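One hedged observation: ad starts out as a Series, while pd.DataFrame({0: [adtf]}, index=i) is a DataFrame, so after the first append ad changes type and integer lookups like ad[tot-1] no longer resolve positionally, which is consistent with the KeyError appearing only for n >= 2. A sketch of a more defensive variant (an assumption, not a verified fix):
# use positional access and keep ad/nu as Series
adtf = 7 + 23*ad.iloc[tot-1] + 55*ad.iloc[tot-2] + 13*nu.iloc[tot-1] + 3*nu.iloc[tot-2]
ad = pd.concat([ad, pd.Series([adtf], index=i)])
nu = pd.concat([nu, pd.Series([k], index=i)])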

Duplicating rows in dataframe python

Good afternoon everyone,
I am currently writing a thesis on the KMV model in python. I took inspiration from the code here to solve the non-linear equations. Here is the link to the CSV file used to create the dataframe. And this is the code I have so far:
Import the required modules
from datetime import datetime
import pandas as pd
import numpy as np
import scipy.optimize as sco
from scipy.stats import norm
df = pd.DataFrame()
df = pd.read_csv("AREX.csv", sep=';', engine = "python", decimal=',')
Functions to prepare the file for the model to run
def clean():
    # df.rename(columns={"Date": "Date"}, inplace=True)
    # df["Date"] = pd.to_datetime(df['Date'])
    df.set_index("Date", inplace=True)
    df['AREX.O'] = df['AREX.O'].astype(float)
    df.drop(['Total Short Term debt'], axis=1, inplace=True)
    return df
def preparation():
    df['e'] = df['AREX.O']*df['Share Outstanding']
    df['Short Term Debt'] = df['Debt'] - df['Total Long term Debt']
    df['f'] = df['Short Term Debt'] + df['Total Long term Debt']*0.5
    df['log_ret'] = np.log(df['AREX.O']) - np.log(df['AREX.O'].shift(1))
    # df['stdev'] = df['log_ret'].rolling(252).std()*m.sqrt(252)
    return df
Algorithm used to solve for a and sigma_a.
Here I only adapted the code to my dataframe:
def algo1():
    # formatting the values as required
    df["f"] = df["f"].astype(float)
    df["e"] = df["e"].astype(float)
    # computation of the key input variable for the model
    df['a'] = df['f'].add(df["e"])

    # defining a function for the Black-Scholes equation
    def bseqn(a, debug=False):
        d1 = (np.log(a/f) + (r + 0.5*sigma_a**2)*T)/(sigma_a*np.sqrt(T))
        d2 = d1 - sigma_a*np.sqrt(T)
        y1 = e - (a*norm.cdf(d1) - np.exp(-r*T)*f*norm.cdf(d2))
        if debug:
            print("d1 = {:.6f}".format(d1))
            print("d2 = {:.6f}".format(d2))
            print("Error = {:.6f}".format(y1))
        return y1

    # Solving the model
    time_horizon = [1]
    timesteps = range(1, len(df))
    results = np.empty((df.shape[0], len(time_horizon)))

    # looping to solve for each row
    for i, years in enumerate(time_horizon):
        T = 1
        results[:, i] = df.loc[:, 'a']
        for i_t, t in enumerate(timesteps):
            a = results[t-10:t, i]
            ra = np.log(a/np.roll(a, 1))
            sigma_a = np.nanstd(ra)  # gives initial value of sigma_a
            if i_t == 0:
                subset_timesteps = range(t-1, t+1)
                print(subset_timesteps)
            else:
                subset_timesteps = [t]
            n_its = 0
            while n_its < 10:
                n_its += 1
                for t_sub in subset_timesteps:
                    r = df.iloc[t_sub]['r']
                    f = df.iloc[t_sub]['f']
                    e = df.iloc[t_sub]['e']
                    sol = sco.fsolve(bseqn, results[t_sub, i])  # if I replace newton with fsolve the code works properly
                    results[t_sub, i] = sol  # stores the new values of a
                # Update sigma_a based on new values of a
                last_sigma_a = sigma_a
                a = results[t-10:t, i]
                ra = np.log(a/np.roll(a, 1))
                sigma_a = np.nanstd(ra)  # new value of sigma_a
                diff = last_sigma_a - sigma_a
                if abs(diff) < 1e-3:
                    df.loc[t_sub, 'sigma_a'] = sigma_a
                    break
                else:
                    pass
    return df
Run function
def run():
    clean()
    preparation()
    algo1()
    print(df)
    print(list(df))
    # main_df = df.to_csv("AREX_D.csv")
The output should write the results into the created sigma_a column, but instead it adds rows: instead of 1500 rows I end up with 3000, most of them NaN values. I do not understand where the code asks for that...
I suspect it to come from these lines:
diff = last_sigma_a - sigma_a
if abs(diff) < 1e-3:
    df.loc[t_sub, 'sigma_a'] = sigma_a
    break
Does anyone have any insight into what is happening?
Here is a picture of the output: [image omitted]
Thank you very much!
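A hedged observation on the row doubling: df is indexed by Date, while t_sub is an integer position, so df.loc[t_sub, 'sigma_a'] = sigma_a finds no row labelled t_sub and enlarges the frame with a new, mostly-NaN row instead, once per timestep, which matches going from ~1500 to ~3000 rows. A positional-assignment sketch (an assumption, and it requires the column to exist first):
df['sigma_a'] = np.nan  # create the column once, before the loops
df.iloc[t_sub, df.columns.get_loc('sigma_a')] = sigma_a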

Handling real-time data in Python, rolling window

I want to create a function that reads a series of time values from a file (with gaps in the sampling rate, that's the problem), takes exactly 200 days, and lets me move through the entire data length, say 10000 days, like a rolling window.
I am not sure how to code it. Can I add a statement that computes the difference between two values of the time variable (x axis) until it reaches exactly 200 days?
Or can I somehow write a function that finds the starting value, say t0, and then finds the element of the array closest to t0 + 200 days (the interval)?
What I have so far is:
f = open(reading the file from directory)
lines = f.readlines()
print(len(lines))

tx = np.array([])  # times
y = np.array([])
interval = 200  # days

for li in lines:
    col = li.split()
    t0 = np.array([])
    t1 = np.array([])
    tx = np.append(tx, float(col[0]))
    y = np.append(y, float(col[1]))
    t0 = np.append(t0, np.max(tx))
    t1 = np.append(t1, tx[np.argmin(tx)])
print(t0, t1)
days = [t1 + dt.timedelta(days=float(x)) for x in days]
#y = np.random.randn(len(days))
# use pandas for convenient rolling function:
df = pd.DataFrame({"day": tx, "value": y}).set_index("day")

def closest_value(s):
    if s.shape[0] < 2:
        return np.nan
    X = np.empty((s.shape[0]-1, 2))
    X[:, 0] = s[:-1]
    X[:, 1] = np.fabs(s[:-1]-s[-1])
    min_diff = np.min(X[:, 1])
    return X[X[:, 1] == min_diff, 0][0]

df['closest_value'] = df.rolling(window=dt.timedelta(days=200))['value'].apply(closest_value, raw=True)
print(df.tail(5))
Output error:
TypeError: float() argument must be a string or a number, not 'datetime.datetime'
Additionally, here are the first 10 tx and y values respectively:
0 0.003372722575018
0.015239999629557 0.003366515509113
0.045829999726266 0.003385171061055
0.075369999743998 0.003385171061055
0.993219999596477 0.003366515509113
1.022699999623 0.003378941085299
1.05217999964952 0.003369617612836
1.08166999975219 0.003397665493594
3.0025899996981 0.003378941085299
3.04120999993756 0.003394537568711
import numpy as np
import pandas as pd
import datetime as dt
# load data in days and y arrays
# ... or generate them:
N = 1000 # number of days
day_min = dt.datetime.strptime('2000-01-01', '%Y-%m-%d')
day_max = 2000
days = np.sort(np.unique(np.random.uniform(low=0, high=day_max, size=N).astype(int)))
days = [day_min + dt.timedelta(days = int(x)) for x in days]
y = np.random.randn(len(days))
# use pandas for convenient rolling function:
df = pd.DataFrame({"day":days, "value": y}).set_index("day")
def closest_value(s):
    if s.shape[0] < 2:
        return np.nan
    X = np.empty((s.shape[0]-1, 2))
    X[:, 0] = s[:-1]
    X[:, 1] = np.fabs(s[:-1]-s[-1])
    min_diff = np.min(X[:, 1])
    return X[X[:, 1] == min_diff, 0][0]
df['closest_value'] = df.rolling(window=dt.timedelta(days=200))['value'].apply(closest_value, raw=True)
print(df.tail(5))
Output:
                value  closest_value
day
2005-06-15   1.668638       1.591505
2005-06-16   0.316645       0.304382
2005-06-17   0.458580       0.445592
2005-06-18  -0.846174      -0.847854
2005-06-22  -0.151687      -0.166404
You could use pandas, set a datetime range and create a while loop to process the data in batches.
import pandas as pd
from datetime import datetime, timedelta
# Load data into pandas dataframe
df = pd.read_csv(filepath)
# Name columns
df.columns = ['dates', 'num_value']
# Convert strings to datetime
df.dates = pd.to_datetime(df['dates'], format='%d/%m/%Y')
# Print dates within a 200 day interval and move on to the next interval
i = 0
while i < len(df.dates):
    start = df.dates[i]
    end = start + timedelta(days=200)
    print(df.dates[(df.dates >= start) & (df.dates < end)])
    i += 200
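One caveat: i += 200 advances by 200 rows, not 200 days, so with gaps in the sampling the batches drift away from true 200-day windows. A variant that steps by time instead (a sketch, assuming df.dates is sorted):
start = df.dates.iloc[0]
last = df.dates.iloc[-1]
while start <= last:
    end = start + timedelta(days=200)
    print(df.dates[(df.dates >= start) & (df.dates < end)])
    start = end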
Given an input file that looks like:
dates num_value
2004-7-1 1
2004-7-2 5
2004-7-4 8
2004-7-5 11
2004-7-6 17
you can load it with:
df = pd.read_table(filepath, sep="\s+", skiprows=1)
If the columns don't have headers, omit skiprows.

How to include dynamic time?

I am trying to pull logs with respect to time slots. The program below runs fine when the number of hours is given, and the logs in that range get extracted.
But now I also want Start and End to be given dynamically, i.e. say between 8 am and 8 pm, or 6 am and 8 am, and so on.
How do I get that? An edit to the current program will do, or a separate program will also do.
Input: Mini Version of INPUT
Code:
import pandas as pd
from datetime import datetime,time
import numpy as np
fn = r'00_Dart.csv'
cols = ['UserID','StartTime','StopTime', 'gps1', 'gps2']
df = pd.read_csv(fn, header=None, names=cols)
df['m'] = df.StopTime + df.StartTime
df['d'] = df.StopTime - df.StartTime
# 'start' and 'end' for the reporting DF: `r`
# which will contain equal intervals (1 hour in this case)
start = pd.to_datetime(df.StartTime.min(), unit='s').date()
end = pd.to_datetime(df.StopTime.max(), unit='s').date() + pd.Timedelta(days=1)
# building reporting DF: `r`
freq = '1H' # 1 Hour frequency
idx = pd.date_range(start, end, freq=freq)
r = pd.DataFrame(index=idx)
r['start'] = (r.index - pd.Timestamp('1970-01-01')).total_seconds().astype(np.int64)
# 1 hour in seconds, minus one second (so that we will not count it twice)
interval = 60*60 - 1
r['LogCount'] = 0
r['UniqueIDCount'] = 0
for i, row in r.iterrows():
    # intervals overlap test
    # https://en.wikipedia.org/wiki/Interval_tree#Overlap_test
    # i've slightly simplified the calculations of m and d
    # by getting rid of division by 2,
    # because it can be done eliminating common terms
    u = df[np.abs(df.m - 2*row.start - interval) < df.d + interval].UserID
    r.loc[i, ['LogCount', 'UniqueIDCount']] = [len(u), u.nunique()]
r['Date'] = pd.to_datetime(r.start, unit='s').dt.date
r['Day'] = pd.to_datetime(r.start, unit='s').dt.day_name().str[:3]
r['StartTime'] = pd.to_datetime(r.start, unit='s').dt.time
r['EndTime'] = pd.to_datetime(r.start + interval + 1, unit='s').dt.time
#r.to_csv('results.csv', index=False)
#print(r[r.LogCount > 0])
#print (r['StartTime'], r['EndTime'], r['Day'], r['LogCount'], r['UniqueIDCount'])
rout = r[['Date', 'StartTime', 'EndTime', 'Day', 'LogCount', 'UniqueIDCount'] ]
#print rout
rout.to_csv('one_hour.csv', index=False, header=False)
Edit:
In simple words, I should be able to give StartTime and EndTime in the program. The code below is very close to what I am trying to do. But how do I convert this to pandas?
from datetime import datetime, time

start = time(8, 0, 0)
end = time(20, 0, 0)

with open('USC28days_0_20', 'r') as infile, open('USC28days_0_20_time', 'w') as outfile:
    for row in infile:
        col = row.split()
        t1 = datetime.fromtimestamp(float(col[2])).time()
        t2 = datetime.fromtimestamp(float(col[3])).time()
        print(t1 >= start and t2 <= end)
Edit Two: Working answer in Pandas
Taking a part from @MaxU's accepted answer below, the following code strips out the required group of logs between the given StartTime and StopTime:
import pandas as pd
from datetime import datetime,time
import numpy as np
fn = r'00_Dart.csv'
cols = ['UserID','StartTime','StopTime', 'gps1', 'gps2']
df = pd.read_csv(fn, header=None, names=cols)
#df['m'] = df.StopTime + df.StartTime
#df['d'] = df.StopTime - df.StartTime
# filter input data set ...
start_hour = 8
end_hour = 9
df = df[(pd.to_datetime(df.StartTime, unit='s').dt.hour >= start_hour) &
        (pd.to_datetime(df.StopTime, unit='s').dt.hour <= end_hour)]
print(df)
df.to_csv('time_hour.csv', index=False, header=False)
But: it would be an even better solution if I could also control minutes and seconds. At present this also extracts logs whose StopTime hour matches, together with the minutes and seconds running up to the next hour.
Something like:
start_hour = 8:0:0
end_hour = 9:0:0 - 1 # -1 to get the logs until 8:59:59
But this gives me an error (8:0:0 is not valid Python syntax).
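A fairly direct pandas translation of the time(8, 0, 0) comparison from the first edit gives minute- and second-level control (a sketch, assuming StartTime and StopTime hold Unix timestamps as in the code above):
from datetime import time
start = time(8, 0, 0)
end = time(8, 59, 59)
t1 = pd.to_datetime(df.StartTime, unit='s').dt.time
t2 = pd.to_datetime(df.StopTime, unit='s').dt.time
df = df[(t1 >= start) & (t2 <= end)]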
try this:
import pandas as pd
from datetime import datetime,time
import numpy as np
fn = r'D:\data\gDrive\data\.stack.overflow\2016-07\dart_small.csv'
cols = ['UserID','StartTime','StopTime', 'gps1', 'gps2']
df = pd.read_csv(fn, header=None, names=cols)
df['m'] = df.StopTime + df.StartTime
df['d'] = df.StopTime - df.StartTime
# filter input data set ...
start_hour = 8
end_hour = 20
df = df[(pd.to_datetime(df.StartTime, unit='s').dt.hour >= start_hour) &
        (pd.to_datetime(df.StartTime, unit='s').dt.hour <= end_hour)]
# 'start' and 'end' for the reporting DF: `r`
# which will contain equal intervals (1 hour in this case)
start = pd.to_datetime(df.StartTime.min(), unit='s').date()
end = pd.to_datetime(df.StopTime.max(), unit='s').date() + pd.Timedelta(days=1)
# building reporting DF: `r`
freq = '1H' # 1 Hour frequency
idx = pd.date_range(start, end, freq=freq)
r = pd.DataFrame(index=idx)
r = r[(r.index.hour >= start_hour) & (r.index.hour <= end_hour)]
r['start'] = (r.index - pd.Timestamp('1970-01-01')).total_seconds().astype(np.int64)
# 1 hour in seconds, minus one second (so that we will not count it twice)
interval = 60*60 - 1
r['LogCount'] = 0
r['UniqueIDCount'] = 0
for i, row in r.iterrows():
    # intervals overlap test
    # https://en.wikipedia.org/wiki/Interval_tree#Overlap_test
    # i've slightly simplified the calculations of m and d
    # by getting rid of division by 2,
    # because it can be done eliminating common terms
    u = df[np.abs(df.m - 2*row.start - interval) < df.d + interval].UserID
    r.loc[i, ['LogCount', 'UniqueIDCount']] = [len(u), u.nunique()]
r['Date'] = pd.to_datetime(r.start, unit='s').dt.date
r['Day'] = pd.to_datetime(r.start, unit='s').dt.day_name().str[:3]
r['StartTime'] = pd.to_datetime(r.start, unit='s').dt.time
r['EndTime'] = pd.to_datetime(r.start + interval + 1, unit='s').dt.time
#r.to_csv('results.csv', index=False)
#print(r[r.LogCount > 0])
#print (r['StartTime'], r['EndTime'], r['Day'], r['LogCount'], r['UniqueIDCount'])
rout = r[['Date', 'StartTime', 'EndTime', 'Day', 'LogCount', 'UniqueIDCount'] ]
#print rout
OLD answer:
from_time = '08:00'
to_time = '18:00'
rout.between_time(from_time, to_time).to_csv('one_hour.csv', index=False, header=False)
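Note that between_time filters on the time-of-day component of the DatetimeIndex, so it also gives the minute- and second-level control asked about above, e.g.:
rout.between_time('08:00:00', '08:59:59').to_csv('one_hour.csv', index=False, header=False)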
