Remove outliers from time series data using pandas - python

I have one-minute data:
# Import data
import yfinance as yf
data = yf.download(tickers="MSFT", period="7d", interval="1m")
print(data.tail())
I would like to remove observations where the minute difference is greater than the daily difference, where the daily difference refers to the day that the minute bar belongs to. I would like to apply this rule to every column except Volume. Beginning of the code:
minute_diff = data.diff()
dail_diff = data.resample('D').last().diff().median()
# here, remove rows from data where minute_diff is greater than the daily diff

minute_diff = data.diff().reset_index()
dail_diff = data.resample('D').last().diff().median()
cols = minute_diff.columns.to_list()
cols.remove('Datetime')
for c in cols:
    minute_diff = minute_diff[(minute_diff[c] <= dail_diff[c]) | (minute_diff[c].isnull())]
data = data.loc[minute_diff['Datetime']]
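The same filter can also be written without the explicit column loop by broadcasting the daily medians across the minute differences (a sketch, assuming the standard single-level yfinance columns; it also drops Volume from the comparison, as the question asks):
minute_diff = data.diff()
daily_diff = data.resample('D').last().diff().median()  # Series indexed by column name
cols = [c for c in data.columns if c != 'Volume']
# keep a row only if every non-Volume minute difference is within the daily median difference
keep = ((minute_diff[cols] <= daily_diff[cols]) | minute_diff[cols].isna()).all(axis=1)
data_filtered = data[keep]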

import pandas as pd
# Import data
import yfinance as yf
data = yf.download(tickers="MSFT", period="7d", interval="1m")
data_minute = data.copy()
data_minute['Date'] = data_minute.index.astype('datetime64[ns]')
data_minute['Date'] = data_minute['Date'].dt.normalize()
#Create new column for difference of current close minus previous close
data_minute['Minute Close Difference'] = data_minute['Close'] - data_minute['Close'].shift(1)
#Convert minute data to daily data
data_daily = data_minute.resample('D').agg({'Open': 'first',
                                            'High': 'max',
                                            'Low': 'min',
                                            'Close': 'last',
                                            'Adj Close': 'last',
                                            'Volume': 'sum'})
data_daily['Date'] = data_daily.index.astype('datetime64[ns]')
data_daily['Date'] = data_daily['Date'].dt.normalize()
data_daily = data_daily.set_index('Date')
#Create new column for difference of current close minus previous close
data_daily['Daily Close Difference'] = data_daily['Close'] - data_daily['Close'].shift(1)
data_minute = pd.merge(data_minute,data_daily['Daily Close Difference'],how = 'left', left_on = 'Date', right_index = True)
data_minute = data_minute[data_minute['Minute Close Difference'].abs() <= data_minute['Daily Close Difference'].abs()]
data_minute
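A small, illustrative follow-up (names follow the answer above): drop the helper columns once the filter has been applied, and check how many minute bars survived.
data_filtered = data_minute.drop(columns=['Date', 'Minute Close Difference', 'Daily Close Difference'])
print(f'kept {len(data_filtered)} of {len(data)} minute bars')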

I have found the solution:
import numpy as np

daily_diff = data.resample('D').last().dropna().diff() * 25
daily_diff['diff_date'] = daily_diff.index.strftime('%Y-%m-%d')
data_test = data.diff()
data_test['diff_date'] = data_test.index.strftime('%Y-%m-%d')
data_test_diff = pd.merge(data_test, daily_diff, on='diff_date')
data_test_final = data_test_diff.loc[(np.abs(data_test_diff['Close_x']) < np.abs(data_test_diff['Close_y']))]
data_test_final['Close_x'].plot()
indexer = (np.abs(data_test_diff['Close_x']) < np.abs(data_test_diff['Close_y']))
data_final = data.loc[indexer.values, :]

Related

Trouble with dimensions in netCDF: index exceeds dimension bounds

I want to extract monthly temperature data from several netCDF files in different locations. Files are built as follows:
> print(data.variables.keys())
dict_keys(['lon', 'lat', 'time', 'tmp','stn'])
Files hold names like "tmp_1901_1910."
Here is the code I use:
import glob
import pandas as pd
import os
import numpy as np
import time
from netCDF4 import Dataset

os.chdir('PATH/data_tmp')

all_years = []
for file in glob.glob('*.nc'):
    data = Dataset(file, 'r')
    time_data = data.variables['time'][:]
    time = data.variables['time']
    year = str(file)[4:13]
    all_years.append(year)

# Empty pandas dataframe
year_start = min(all_years)
end_year = max(all_years)
date_range = pd.date_range(start=str(year_start[0:4]) + '-01-01', end=str(end_year[5:9]) + '-12-31', freq='M')
df = pd.DataFrame(0.0, columns=['Temp'], index=date_range)

# Defining the location, lat, lon based on the csv data
cities = pd.read_csv(r'PATH/cities_coordinates.csv', sep=',')
cities['city'] = cities['city'].map(str)

for index, row in cities.iterrows():
    location = row['code_nbs']
    location_latitude = row['lat']
    location_longitude = row['lon']

    # Sorting the list
    all_years.sort()
    for yr in all_years:
        # Reading in the data
        data = Dataset('tmp_' + str(yr) + '.nc', 'r')
        # Storing the lat and lon data of the netCDF file into variables
        lat = data.variables['lat'][:]
        lon = data.variables['lon'][:]
        # Squared difference between the specified lat, lon and the lat, lon of the netCDF
        sq_diff_lat = (lat - location_latitude)**2
        sq_diff_lon = (lon - location_longitude)**2
        # Retrieving the index of the min value for lat and lon
        min_index_lat = sq_diff_lat.argmin()
        min_index_lon = sq_diff_lon.argmin()
        # Accessing the temperature data
        tmp = data.variables['tmp']
        start = str(yr[0:4]) + '-01-01'
        end = str(yr[5:11]) + '-12-31'
        d_range = pd.date_range(start=start, end=end, freq='M')
        for t_index in np.arange(0, len(d_range)):
            print('Recording the value for: ' + str(d_range[t_index]))
            df.loc[d_range[t_index]]['Temp'] = tmp[min_index_lon, min_index_lat, t_index]
    df.to_csv(location + '.csv')
I obtain the following message while running the command df.loc[d_range[t_index]]['Temp']=tmp[min_index_lon, min_index_lat, t_index]
IndexError: index exceeds dimension bounds
I inspect the object's values and have:
print(d_range)
DatetimeIndex(['1901-01-31', '1901-02-28', '1901-03-31', '1901-04-30',
'1901-05-31', '1901-06-30', '1901-07-31', '1901-08-31',
'1901-09-30', '1901-10-31',
...
'1910-03-31', '1910-04-30', '1910-05-31', '1910-06-30',
'1910-07-31', '1910-08-31', '1910-09-30', '1910-10-31',
'1910-11-30', '1910-12-31'],
dtype='datetime64[ns]', length=120, freq='M')
On the first t_index within the loop, I have:
print(t_index)
0
print(d_range[t_index])
1901-01-31 00:00:00
print(min_index_lat)
259
print(min_index_lon)
592
I don't understand what went wrong with the dimensions.
Thank you for any help!
I assume you want to read in all the .nc data and map the closest city to each location. For that, I suggest reading all the data first and afterwards calculating which city each location belongs to. The following code probably needs some adaptation to your data, but it should show the direction you could take to make the code more robust.
Step 1: Import your 'raw' data
e.g. into one or more DataFrames, depending on whether you can import all the data at once. If not, split steps 1 and 2 into chunks.
df_list = []
for file in glob.glob('*.nc'):
    data = Dataset(file, 'r')
    df_i = pd.DataFrame({
        'time': data.variables['time'][:],
        'lat': data.variables['lat'][:],
        'lon': data.variables['lon'][:],
        'tmp': data.variables['tmp'][:],
        'stn': data.variables['stn'][:],
        'year': str(file)[4:13],  # maybe not needed, as 'time' should hold this info already, and [4:13] needs exactly this file-name format
        'file_name': file,  # to track back the file
        # ... and more
    })
    df_list.append(df_i)
df = pd.concat(df_list, ignore_index=True)
Step 2: map the locations
e.g. with groupby, though there are several other methods. Depending on the amount of data, I suggest using pandas or numpy routines over plain Python loops; they are much faster.
df['city'] = None
gp = df.groupby(['lon', 'lat'])
for values_i, indexes_i in gp.groups.items():
    # Add your code to get the closest city
    # values_i[0] is 'lon'
    # values_i[1] is 'lat'
    # e.g.:
    diff_lon_lat = np.hypot(cities['lon'] - values_i[0], cities['lat'] - values_i[1])
    location = cities.loc[diff_lon_lat.idxmin(), 'code_nbs']
    # and add the parameters to the df
    df.loc[indexes_i, 'city'] = location
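A possible follow-up once every row has its city (a sketch; converting the raw 'time' values to datetimes depends on the time units stored in the files, so the unit and origin below are only assumptions):
# convert the numeric time axis to datetimes; adjust unit/origin to match the file's time metadata
df['time'] = pd.to_datetime(df['time'], unit='D', origin='1900-01-01')
# monthly mean temperature per city, which is what the question is ultimately after
monthly_tmp = df.groupby(['city', pd.Grouper(key='time', freq='M')])['tmp'].mean()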

Why does statsmodels adfuller give me an error when I use a for loop to iterate through a dataframe and select each time series?

The problem here is to test whether all stocks are integrated of order 1, I(1), and then search for cointegrated pairs. So far I'm just testing whether they're I(1) using the ADF test, but it is not working correctly.
import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib as ptl
import statsmodels.tsa.stattools as ts
#List of the 50 companies with most % in IBOVESPA at 05-02-2022
Stocks = ["VALE3","PETR4","ITUB4","BBDC4","PETR3","B3SA3","ABEV3","JBSS3","BBAS3","WEGE3","ITSA4","HAPV3","SUZB3","RENT3"
,"GGBR4","BPAC11","RDOR3","EQTL3","CSAN3","VBBR3","LREN3","BBDC3","RADL3","PRIO3","VIVT3","RAIL3","ENEV3","BBSE3","KLBN11","TOTS3"
,"CMIG4","NTCO3","HYPE3","SBSP3","BRFS3","ELET3","AMER3","UGPA3","MGLU3","CCRO3","CSNA3","ASAI3","ENGI11","SANB11","TIMS3","CPLE6"
,"EGIE3","BRKM5","EMBR3","ELET6"]
Stocks_SA = []
for tickers in Stocks:
    new_i = f'{tickers}.SA'
    Stocks_SA.append(new_i)

def download_data(List):
    data = pd.DataFrame()
    names = []
    for i in List:
        df = pd.DataFrame(yf.download(i, start="2020-04-30", end="2021-04-30"))
        df = df.dropna()
        df["Adj Close"] = np.log(df["Adj Close"])
        df2 = df.iloc[:, 4]
        data = pd.concat([data, df2], axis=1)
        names.append(i)
    data.columns = names
    return data

s_data = download_data(Stocks_SA)

import statsmodels.tsa.stattools as ts

def Testing_ADF(data):  # Test if all stocks are integrated of order one, I(1)
    names = data.columns.values.tolist()
    n = data.shape[1]
    I_one = []
    keys = data.keys()
    for n in names:
        series = data[n]
        result_adf = ts.adfuller(series)
        if result_adf[1] > 0.05:
            I_one.append(n)
    return I_one

I_one_list = Testing_ADF(s_data)
I_one_list
When I run Testing_ADF(s_data) I get MissingDataError: exog contains inf or nans, but if I run just this code it works perfectly:
df = pd.DataFrame(yf.download("VALE3.SA",start = "2020-04-30", end = "2021-04-30"))
#df2 = ts.adfuller(df)
df["Adj Close"] = np.log(df["Adj Close"])
df2 = df.iloc[:,4]
df2.dropna()
adfuller = ts.adfuller(df2)
adfuller
So, why does it work in one case and not in the other, and how can I fix it?
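One likely cause: when the individual tickers are concatenated along axis=1, dates on which only some of them traded become NaN in the other columns, and adfuller raises MissingDataError on NaNs. A minimal sketch of a fix, under that assumption, is to drop the NaNs per series inside the loop:
def Testing_ADF(data):  # test each column separately, skipping the NaNs created by the column-wise concat
    I_one = []
    for n in data.columns:
        series = data[n].dropna()
        result_adf = ts.adfuller(series)
        if result_adf[1] > 0.05:
            I_one.append(n)
    return I_one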

How do I create a dataframe using one variable being live streamed?

I am streaming live price data using the IB API, and I want to put it in a dataframe for analysis. My data consists of a price being live streamed with no timestamp.
I think I need to create new rows using row numbers that are automatically added, and have the prices inserted in the price column.
I have tried defining the dataframe and telling the price where to go as follows:
def tick_df(self, reqId, contract):  # this stores the price dataframe by creating an empty dataframe and setting the index to the time column
    self.bardata[reqId] = pd.DataFrame(columns=['index', 'price'])
    self.reqMktData(reqId, contract, "", False, False, [])
    self.bardata[reqId].index = [x for x in range(1, len(self.bardata[reqId].values) + 1)]
    return self.bardata[reqId]

def tickPrice(self, reqId, tickType, price, attrib):  # this function prints the price
    if tickType == 2 and reqId == 102:
        self.bardata[reqId].loc[self.bardata[reqId].index] = price
I have been using a methodology similar to here (https://github.com/PythonForForex/Interactive-brokers-python-api-guide/blob/master/GOOG_five_percent.py). However, as I am only streaming a price, I am unable to use the timestamp for creating new rows.
I don't know if this is what you need. In a loop I generate a random price and append it to a data frame.
import numpy as np
import pandas as pd

_price = 1.1300  # first price in the series
_std = 0.0005    # volatility (standard deviation)
df = pd.DataFrame(columns=['price'])
for i in range(1000):
    _wn = np.random.normal(loc=0, scale=_std, size=1)  # random white noise
    _price = _price + _wn[0]  # random price
    # append one row (DataFrame.append was removed in pandas 2.0, so concat is used here)
    df = pd.concat([df, pd.DataFrame([{'price': _price}])], ignore_index=True)
df
I work with FOREX time series and I cannot conceive of a time series without time, so, just in case you have the same 'problem', I'm including a version with a timestamp:
import numpy as np
import pandas as pd
from datetime import datetime

_price = 1.1300  # first price in the series
_std = 0.0005    # volatility (standard deviation)
df = pd.DataFrame(columns=['price', 'time'])
for i in range(1000):
    _wn = np.random.normal(loc=0, scale=_std, size=1)  # random white noise
    _price = _price + _wn[0]  # random price
    _time = datetime.now()
    # append one row (DataFrame.append was removed in pandas 2.0, so concat is used here)
    df = pd.concat([df, pd.DataFrame([{'price': _price, 'time': _time}])], ignore_index=True)
df
Please let me know if this is what you needed.
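For a live tick stream, growing a DataFrame row by row gets slow, because each append/concat copies all the existing data. A common alternative (a sketch; the class and method names here are made up, not part of the IB API) is to accumulate ticks in a plain list and build the DataFrame only when you need to analyse it:
import pandas as pd

class TickBuffer:
    """Accumulate streamed prices cheaply; convert to a DataFrame on demand."""
    def __init__(self):
        self.prices = []

    def on_price(self, price):
        self.prices.append(price)  # appending to a list is O(1)

    def to_frame(self):
        # the automatically generated row number becomes the index
        return pd.DataFrame({'price': self.prices})

buf = TickBuffer()
for p in (1.1300, 1.1302, 1.1299):  # stand-in for ticks arriving in tickPrice()
    buf.on_price(p)
df = buf.to_frame()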

pandas dataframe creating columns with loop

I'm trying to add new columns and fill them with data using for loops: take data from the Price column and insert 1000 values into a new dataframe column; after 1000 Price values, start a new column for the next 1000, and so on.
import pandas as pd
import matplotlib.pyplot as plt

data_frame = pd.read_csv('candle_data.csv', names=['Time', 'Symbol', 'Side', 'Size', 'Price', '1', '2', '3', '4', '5'])
price_df = pd.DataFrame()
count_tick = 0
count_candle = 0
for price in data_frame['Price']:
    if count_tick < 1000:
        price_df[count_candle] = price
        count_tick += 1
    elif count_tick == 1000:
        count_tick = 0
        count_candle += 1
price_df.head()
It's not necessary to loop through the data frame; you can use slicing to achieve this. Look at the sample code below. I have loaded a DataFrame with 100 rows and am creating column 'col3' from the first 50 rows of 'col1', and after that column 'col4' from the next 50 rows of 'col1'. You could modify the code below to point to your columns and the values that you want.
import pandas as pd
import numpy as np

if __name__ == '__main__':
    col1 = np.linspace(0, 100, 100)
    col2 = np.linspace(100, 200, 100)
    dict = {'col1': col1, 'col2': col2}
    df = pd.DataFrame(dict)
    df['col3'] = df['col1'][0:50]
    df['col4'] = df['col1'][50:100]
    print(df)
Solution 2 based on added info from comments
import pandas as pd
import numpy as np

if __name__ == '__main__':
    pd.set_option('display.width', 100000)
    pd.set_option('display.max_columns', 500)
    ### partition size; for the example I have taken a low volume of 20
    part_size = 20
    ## number generation for the data frame
    col1 = np.linspace(0, 100, 100)
    col2 = np.linspace(100, 200, 100)
    ## create the initial data frame
    dict = {'col1': col1, 'col2': col2}
    df = pd.DataFrame(dict)
    len = df.shape[0]
    ## tells you how many new columns you need
    rec = int(len/part_size)
    _ = {}
    ## initialize slicing variables
    low = 0
    high = part_size
    print(len)
    for i in range(rec):
        if high >= len:
            _['col_name_here{0}'.format(i)] = df[low:]['col1']
            break
        else:
            _['col_name_here{0}'.format(i)] = df[low:high]['col1']
            low = high
            high += part_size
    df = df.assign(**_)
    print(df)
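An alternative way to express the same 'one column per chunk' idea for the original question (a compact sketch, assuming data_frame has been loaded as in the question; the Price column is padded with NaN up to a multiple of the chunk size so it can be reshaped):
import numpy as np
import pandas as pd

chunk = 1000
prices = data_frame['Price'].to_numpy(dtype=float)
n_cols = int(np.ceil(len(prices) / chunk))
padded = np.full(n_cols * chunk, np.nan)
padded[:len(prices)] = prices
# each column holds 1000 consecutive Price values; the last column is NaN-padded
price_df = pd.DataFrame(padded.reshape(n_cols, chunk).T)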

How to fill missing dates in a time series

Here's what my data looks like:
There are daily records, except for a gap from 2017-06-12 to 2017-06-16.
df2['timestamp'] = pd.to_datetime(df['timestamp'])
df2['timestamp'] = df2['timestamp'].map(lambda x: datetime.datetime.strftime(x, '%Y-%m-%d'))
df2 = df2.convert_objects(convert_numeric=True)
df2 = df2.groupby('timestamp', as_index=False).sum()
I need to fill this missing gap and others with values for all fields (e.g. timestamp, temperature, humidity, light, pressure, speed, battery_voltage, etc...).
How can I accomplish this with Pandas?
This is what I have done before
weektime = pd.date_range(start='06/04/2017', end='12/05/2017', freq='W-SUN')
df['week'] = 'nan'
df['weektemp'] = 'nan'
df['weekhumidity'] = 'nan'
df['weeklight'] = 'nan'
df['weekpressure'] = 'nan'
df['weekspeed'] = 'nan'
df['weekbattery_voltage'] = 'nan'
for i in range(0, len(weektime)):
    df['week'][i+1] = weektime[i]
    df['weektemp'][i+1] = df['temperature'].iloc[7*i+1:7*i+7].sum()
    df['weekhumidity'][i+1] = df['humidity'].iloc[7*i+1:7*i+7].sum()
    df['weeklight'][i+1] = df['light'].iloc[7*i+1:7*i+7].sum()
    df['weekpressure'][i+1] = df['pressure'].iloc[7*i+1:7*i+7].sum()
    df['weekspeed'][i+1] = df['speed'].iloc[7*i+1:7*i+7].sum()
    df['weekbattery_voltage'][i+1] = df['battery_voltage'].iloc[7*i+1:7*i+7].sum()
    i = i + 1
The value of the sum is not correct, because the value at 2017-06-17 includes the sum of 2017-06-12 to 2017-06-16, and I do not want to add those again. This is not the only gap in the period; I want to fill all of them.
Here is a function I wrote that might be helpful to you. It looks for inconsistent jumps in time and fills them in. After using this function, try using a linear interpolation function (pandas has a good one) to fill in your null data values. Note: Numpy arrays are much faster to iterate over and manipulate than Pandas dataframes, which is why I switch between the two.
import numpy as np
import pandas as pd

data_arr = np.array(your_df)
periodicity = 'daily'

def fill_gaps(data_arr, periodicity):
    rows = data_arr.shape[0]
    data_no_gaps = np.copy(data_arr)  # avoid altering the thing you're iterating over
    data_no_gaps_idx = 0
    for row_idx in np.arange(1, rows):  # iterate once for each row (except the first record; nothing to compare)
        oldtimestamp_str = str(data_arr[row_idx-1, 0])
        oldtimestamp = np.datetime64(oldtimestamp_str)
        currenttimestamp_str = str(data_arr[row_idx, 0])
        currenttimestamp = np.datetime64(currenttimestamp_str)
        period = currenttimestamp - oldtimestamp
        if period != np.timedelta64(900, 's') and period != np.timedelta64(3600, 's') and period != np.timedelta64(86400, 's'):
            if periodicity == 'quarterly':
                desired_period = 900
            elif periodicity == 'hourly':
                desired_period = 3600
            elif periodicity == 'daily':
                desired_period = 86400
            periods_missing = int(period / np.timedelta64(desired_period, 's'))
            for missing in np.arange(1, periods_missing):
                new_time_orig = str(oldtimestamp + missing*(np.timedelta64(desired_period, 's')))
                new_time = new_time_orig.replace('T', ' ')
                data_no_gaps = np.insert(data_no_gaps, (data_no_gaps_idx + missing),
                                         np.array((new_time, np.nan, np.nan, np.nan, np.nan, np.nan)), 0)  # INSERT VALUES YOU WANT IN THE NEW ROW
            data_no_gaps_idx += (periods_missing - 1)  # increment the index (zero-based => -1) in accordance with added rows
        data_no_gaps_idx += 1  # allow the index to change as we iterate over the original data array (main for loop)
    # create a dataframe:
    data_arr_no_gaps = pd.DataFrame(data=data_no_gaps, index=None, columns=['Time', 'temp', 'humidity', 'light', 'pressure', 'speed'])
    return data_arr_no_gaps
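A possible way to call the function and then do the interpolation step mentioned above (a small sketch; it assumes the remaining columns can be cast to float once the string timestamps are moved to the index):
filled = fill_gaps(data_arr, periodicity)
filled = filled.set_index(pd.to_datetime(filled['Time'])).drop(columns='Time')
filled = filled.astype(float).interpolate(method='linear')  # linear interpolation over the inserted NaN rows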
Fill time gaps and nulls
Use the function below to ensure the expected date sequence exists, and then use forward fill to fill in the nulls.
import pandas as pd
import os
def fill_gaps_and_nulls(df, freq='1D'):
    '''
    General steps:
    A) check for extra dates (out of expected frequency/sequence)
    B) check for missing dates (based on expected frequency/sequence)
    C) use forward fill to fill nulls
    D) use backward fill to fill remaining nulls
    E) append to file
    '''
    # rename the timestamp to 'date'
    df = df.rename(columns={"timestamp": "date"})
    # sort to make indexing faster
    df = df.sort_values(by=['date'], inplace=False)
    # create an artificial index of dates at frequency = freq, with the same beginning and ending as the original data
    all_dates = pd.date_range(start=df.date.min(), end=df.date.max(), freq=freq)
    # record column names
    df_cols = df.columns
    # delete ffill_df.csv so we can begin anew
    try:
        os.remove('ffill_df.csv')
    except FileNotFoundError:
        pass
    # check for extra dates and/or dates out of order; print a warning statement for the log
    extra_dates = set(df.date).difference(all_dates)
    # if there are extra dates (outside of the expected sequence/frequency), deal with them
    if len(extra_dates) > 0:
        #############################
        # INSERT DESIRED BEHAVIOR HERE
        print('WARNING: Extra date(s):\n\t{}\n\t Shifting highlighted date(s) back by 1 day'.format(extra_dates))
        for date in extra_dates:
            # shift extra dates back one day
            df.date[df.date == date] = date - pd.Timedelta(days=1)
        #############################
    # check the artificial date index against df to identify missing gaps in time and fill them with nulls
    gaps = all_dates.difference(set(df.date))
    print('\n-------\nWARNING: Missing dates: {}\n-------\n'.format(gaps))
    # if there are time gaps, deal with them
    if len(gaps) > 0:
        # initialize a df of the correct size, filled with nulls
        gaps_df = pd.DataFrame(index=gaps, columns=df_cols.drop('date'))  # len(index) sets the number of rows
        # give the index a name
        gaps_df.index.name = 'date'
        # add the region and type (r and t come from the answerer's surrounding script)
        gaps_df.region = r
        gaps_df.type = t
        # remove that index so gaps_df and df are compatible
        gaps_df.reset_index(inplace=True)
        # append gaps_df to df
        new_df = pd.concat([df, gaps_df])
        # sort on date
        new_df.sort_values(by='date', inplace=True)
        # fill nulls
        new_df.fillna(method='ffill', inplace=True)
        new_df.fillna(method='bfill', inplace=True)
        # append to file
        new_df.to_csv('ffill_df.csv', mode='a', header=False, index=False)
    # regions and types, like r and t above, are defined elsewhere in the answerer's own script
    return df_cols, regions, types, all_dates
