I have a function that does the following:
It inserts class values 1, 2, 3 based on timestamps. This works as expected, and in the first iteration of the first for-loop I get the following class distribution:
mapping: {'Seizure': 1, 'Preictal': 2, 'Interictal': 3}
value counts:
3.0 3150000
2.0 450000
1.0 28000
Name: class, dtype:
So I have this number of rows for each class.
However, in the second for-loop I iterate through the same list of timestamps and want to subset the data between those timestamps, including some conditions based on the classes I inserted in the first for-loop.
This is the result for the same timestamps, e.g. the first iteration:
len sz: 28000
len prei: 450000
len pre int: 29700000
logging
len post int: 1485499
How on earth do pre_int and post_int (the interictal class) end up with counts this high? They don't correspond at all to the interictal count from the first loop.
Here is my function:
def insert_class_col(dataframe, sz_info_list, date_converter, save_filename, save_path, file_sample_rate, file_channel):
    print(f"sz_info_list: {sz_info_list}")
    if "class" not in dataframe.columns:
        dataframe.insert(0, "class", np.nan)
    file_channel.extend(['timestamp', 'class'])
    dataframe = dataframe[file_channel]

    # Insert class attributes to ensure that seizure, preictal and interictal do not overlap.
    for index, container in enumerate(sz_info_list):
        delay = container.delay * 1000
        duration = container.duration * 1000
        sz_start = date_converter(container.time_emu) + delay
        sz_end = sz_start + duration
        print(f"sz_start index = {sz_start}")
        print(f"sz_end: {sz_end}")
        preictal_start = sz_start - (15 * 60 * 1000)
        interictal_start = sz_start - (1 * 60 * 60 * 1000)
        interictal_end = sz_end + (1 * 60 * 60 * 1000)
        dataframe['timestamp'] = pd.to_numeric(dataframe['timestamp'])
        # If the data falls within the seizure window, tag it as Seizure.
        # If the data is preictal, tag it Preictal/Interictal, but never inside the seizure window.
        dataframe.loc[(dataframe['timestamp'] >= sz_start) & (dataframe['timestamp'] < sz_end), "class"] = class_mapping['Seizure']
        dataframe.loc[(dataframe['class'] != class_mapping['Seizure']) & (dataframe['timestamp'] >= preictal_start) & (dataframe['timestamp'] < sz_start), "class"] = class_mapping['Preictal']
        dataframe.loc[(dataframe['class'] != class_mapping['Seizure']) & (dataframe['class'] != class_mapping['Preictal']) & (dataframe['timestamp'] >= interictal_start) & (dataframe['timestamp'] < interictal_end), "class"] = class_mapping['Interictal']
        print(f"mapping: {class_mapping} \n value counts: \n{dataframe['class'].value_counts()}")
        print(f"Beginning current number of class in df {dataframe['class'].value_counts()}")

    # Saving to csv
    for index, container in enumerate(sz_info_list):
        delay = container.delay * 1000
        duration = container.duration * 1000
        sz_start = date_converter(container.time_emu) + delay
        sz_end = sz_start + duration
        print(f"sz_start index = {sz_start}")
        print(f"sz_end: {sz_end}")
        preictal_start = sz_start - (15 * 60 * 1000)
        interictal_start = sz_start - (1 * 60 * 60 * 1000)
        interictal_end = sz_end + (1 * 60 * 60 * 1000)
        dataframe['timestamp'] = pd.to_numeric(dataframe['timestamp'])

        # SEIZURE CLASS
        sz_df = dataframe[(dataframe['timestamp'] >= sz_start) & (dataframe['timestamp'] < sz_end)].copy()
        print(f"len sz: {len(sz_df)}")
        #df_save_compress(f"Seizure_{index}_{save_filename}", save_path + "/Seizure", sz_df)
        #logging_info_txt(f"Seizure_{index}_{save_filename}", save_path, file_sample_rate, file_channel)

        # PREICTAL
        prei_df = dataframe[(dataframe['timestamp'] >= preictal_start) & (dataframe['timestamp'] < sz_start) & (dataframe['class'] != class_mapping["Seizure"])].copy()
        print(f"len prei: {len(prei_df)}")
        #df_save_compress(f"Preictal_{index}_{save_filename}", save_path + "/Preictal", prei_df)
        #logging_info_txt(f"Preictal_{index}_{save_filename}", save_path, file_sample_rate, file_channel)

        # INTERICTAL
        pre_int_df = dataframe[(dataframe['timestamp'] >= interictal_start) & (dataframe['timestamp'] < preictal_start) & (dataframe['class'] != class_mapping["Seizure"]) | (dataframe['class'] != class_mapping["Preictal"])].copy()
        print(f"len pre int: {len(pre_int_df)}")
        #df_save_compress(f"PreInt_{index}_{save_filename}", save_path + "/Interictal", pre_int_df)
        logging_info_txt(f"PreInt_{index}_{save_filename}", save_path, file_sample_rate, file_channel)

        post_int_df = dataframe[(dataframe['timestamp'] >= sz_end) & (dataframe['timestamp'] < interictal_end) & (dataframe['class'] != class_mapping["Seizure"]) & (dataframe['class'] != class_mapping["Preictal"])].copy()
        print(f"len post int: {len(post_int_df)}")
        #df_save_compress(f"PostInt_{index}_{save_filename}", save_path + "/Interictal", post_int_df)
        logging_info_txt(f"PostInt_{index}_{save_filename}", save_path, file_sample_rate, file_channel)

        #print(f"after = len df: {len(dataframe)} values class: \n {dataframe['class'].value_counts()}")

        # clean up
        del pre_int_df, post_int_df, sz_df, prei_df
        gc.collect()
Notice that pre_int, which is interictal, has 29,700,000 rows, while according to the printed class counts it should be lower than 3,150,000.
Any ideas about this pandas behavior?
Edit: richardec answered the question, see the comments.
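For reference, the likely culprit is operator precedence in the pre_int_df mask: in Python, & binds more tightly than |, so the filter is evaluated as (timestamp range & not Seizure) | not Preictal, and the right-hand side alone matches nearly every row in the DataFrame. A sketch of the mask with explicit parentheses, mirroring the all-& logic already used for post_int_df:

        # Keep the class exclusions ANDed with the timestamp range (sketch of the corrected mask):
        pre_int_df = dataframe[
            (dataframe['timestamp'] >= interictal_start)
            & (dataframe['timestamp'] < preictal_start)
            & (dataframe['class'] != class_mapping["Seizure"])
            & (dataframe['class'] != class_mapping["Preictal"])
        ].copy()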
I'm using this code to calculate pivot points.
def pivots_low(osc, LBR, LBL):
    pivots = []
    for i in range(len(osc) - LBR):
        pivots.append(0)
        pivot = True
        if i > LBL:
            for j in range(1, LBR + 1):
                if osc[i] >= osc[i + j]:
                    pivot = False
            for j in range(1, LBL + 1):
                if osc[i] > osc[i - j]:
                    pivot = False
            if pivot is True:
                pivots[len(pivots) - 1] = osc[i]
    for i in range(LBR):
        pivots.append(0)
    return pivots
This returns an array with 0's where there is no pivot, and the value of the pivot where there is one.
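For illustration, on a toy series with equal left/right lookbacks (my own example, not from the post), the output looks like this:

lows = [5, 4, 3, 2, 3, 4, 5]
print(pivots_low(lows, 2, 2))   # -> [0, 0, 0, 2, 0, 0, 0]  (the pivot low of 2 sits at index 3)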
When comparing the results to TradingView (a downloaded csv with pivot points), the only time it matches exactly is when the left and right lookbacks are both 5. Otherwise it deviates in the total number of pivots and in the location of some of them.
But using this code to calculate pivot highs:
def pivots_high(osc, LBR, LBL):
    pivots = []
    for i in range(len(osc) - LBR):
        pivots.append(0)
        pivot = True
        if i > LBL:
            for j in range(1, LBL + 1):
                if osc[i] < osc[i - j]:
                    pivot = False
            for j in range(1, LBR + 1):
                if osc[i] <= osc[i + j]:
                    pivot = False
            if pivot is True:
                pivots[len(pivots) - 1] = osc[i]
    for i in range(LBR):
        pivots.append(0)
    return pivots
the results are perfect regardless of the lookback values. But the code is almost exactly the same apart from the comparison operators.
What is going wrong here? This is day 3 of having this problem and I just can't fix it.
To Reproduce:
Load Data:
Full_Data = pd.read_csv(file)
Use this simple function to check matches between the calculated pivots and the TradingView pivots:
def match_pivs(data, pivs_h, pivs_l):  # data is a DataFrame loaded from the TradingView csv
    global lblh
    global lbrh
    global lbll
    global lbrl
    start = lbrh
    if lbrl > lbrh:
        start = lbrl
    match_h = 0
    tot_hd = 0
    tot_hp = 0
    match_l = 0
    tot_ld = 0
    tot_lp = 0
    for i in range(start, len(data)):
        if data['PivHigh'][i] != 0 and pivs_h[i - lbrh] != 0:
            match_h += 1
        if data['PivLow'][i] != 0 and pivs_l[i - lbrl] != 0:
            match_l += 1
        if data['PivHigh'][i] != 0:
            tot_hd += 1
        if data['PivLow'][i] != 0:
            tot_ld += 1
        if pivs_h[i] != 0:
            tot_hp += 1
        if pivs_l[i] != 0:
            tot_lp += 1
    print('PivsLow ' + str(tot_lp))
    print('DataLows ' + str(tot_ld))
    print('MatchesL ' + str(match_l))
    print('PivsHigh ' + str(tot_hp))
    print('DataHighs ' + str(tot_hd))
    print('MatchesH ' + str(match_h))
And to get the csv from TradingView:
//@version=5
indicator("Data Script", overlay=true, max_labels_count=500)
lengthGroupTitle = "Pivot Lengths"  // group title constant (missing from the original snippet)
leftLenL = input.int(title="Pivot Low", defval=10, minval=1, inline="Pivot Low", group=lengthGroupTitle)
rightLenL = input.int(title="/", defval=10, minval=1, inline="Pivot Low", group=lengthGroupTitle)
leftLenH = input.int(title="Pivot High", defval=10, minval=1, inline="Pivot High", group=lengthGroupTitle)
rightLenH = input.int(title="/", defval=10, minval=1, inline="Pivot High", group=lengthGroupTitle)
ph = ta.pivothigh(leftLenH, rightLenH)
pl = ta.pivotlow(leftLenL, rightLenL)
var float plth = 0.0  // declared here so the snippet compiles (missing in the original)
var float pltl = 0.0
if not na(ph)
    plth := ph
else
    plth := 0.0
if not na(pl)
    pltl := pl
else
    pltl := 0.0
plot(plth, 'PivHigh')
plot(pltl, 'PivLow')
then just download the csv with this script loaded.
Run the program with these three lines:
pl = pivots_low(Full_Data['low'], lbll, lbrl)
ph = pivots_high(Full_Data['high'], lbrh, lblh)
match_pivs(Full_Data, ph, pl)
Finally found a way.
I still have no idea why that code does not work, but I've written a different version that seems to match the TradingView data 100%.
def checkhl(data_back, data_forward, hl):
    if hl == 'high' or hl == 'High':
        ref = data_back[len(data_back) - 1]
        for i in range(len(data_back) - 1):
            if ref < data_back[i]:
                return 0
        for i in range(len(data_forward)):
            if ref <= data_forward[i]:
                return 0
        return 1
    if hl == 'low' or hl == 'Low':
        ref = data_back[len(data_back) - 1]
        for i in range(len(data_back) - 1):
            if ref > data_back[i]:
                return 0
        for i in range(len(data_forward)):
            if ref >= data_forward[i]:
                return 0
        return 1
def pivot(osc, LBL, LBR, highlow):
    left = []
    right = []
    pivots = []
    for i in range(len(osc)):
        pivots.append(0.0)
        if i < LBL + 1:
            left.append(osc[i])
        if i > LBL:
            right.append(osc[i])
        if i > LBL + LBR:
            left.append(right[0])
            left.pop(0)
            right.pop(0)
            if checkhl(left, right, highlow):
                pivots[i - LBR] = osc[i - LBR]
    return pivots
then just do:
pivots_low = pivot(data, lbl, lbr, 'low')
pivots_high = pivot(data, lbl, lbr, 'high')
All the pivots will be at the actual position where they occur (not lbr bars later); everywhere else the value will be 0.0.
I'm not sure if this is efficient or not, but it seems to work.
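To sanity-check it, here is a minimal sketch on toy data (my own example, not from the original post):

lows  = [5, 4, 3, 2, 3, 4, 5, 4, 3]
highs = [1, 2, 3, 4, 3, 2, 1, 2, 3]

print(pivot(lows, 2, 2, 'low'))    # pivot low of 2 reported at index 3, 0.0 elsewhere
print(pivot(highs, 2, 2, 'high'))  # pivot high of 4 reported at index 3, 0.0 elsewhere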
I am trying to calculate RSI using simple functions.
The general formula for it is:
RSI = 100 / (1 + RS), where RS = exponential moving average of gains / exponential moving average of losses.
Here is what I am getting: [screenshot of my RSI plot]
Here is how it should look: [reference screenshot]
I have everything double-checked, or even triple-checked, but I can't find any mistake.
So I need your help. I know the question is very simple, but I have no idea where I have made the mistake.
The general idea of RSI is that it should be low where the price is low and high where the price is high, yet no matter what I try, mine comes out upside down.
def EMA(close_price_arr, n):
    a = (2 / n + 1)
    EMA_n = np.empty((1, len(close_price_arr)))
    nominator_ema = 0
    denominator_ema = 0
    for i in range(len(close_price_arr)):
        if i < n:
            # creating NaN values where it is impossible to calculate the EMA, to drop them later after connecting the whole database
            EMA_n[0, i] = 'NaN'
        if i >= n:
            # calculating nominator and denominator of EMA
            for j in range(n):
                nominator_ema += close_price_arr[i - j] * a**(j)
                denominator_ema += a**(j)
            EMA_n[0, i] = nominator_ema / denominator_ema
            nominator_ema = 0
            denominator_ema = 0
    return EMA_n
def gains(close_price_arr):
    gain_arr = np.empty((len(close_price_arr) - 1))
    for i in range(len(close_price_arr)):
        if i == 0:
            pass
        if i >= 1:
            if close_price_arr[i] > close_price_arr[i - 1]:
                gain_arr[i - 1] = (close_price_arr[i] - close_price_arr[i - 1])
            else:
                gain_arr[i - 1] = 0
    return gain_arr
def losses(close_price_arr):
    loss_arr = np.empty((len(close_price_arr) - 1))
    for i in range(len(close_price_arr)):
        if i == 0:
            pass
        if i >= 1:
            if close_price_arr[i] < close_price_arr[i - 1]:
                loss_arr[i - 1] = abs(close_price_arr[i] - close_price_arr[i - 1])
            else:
                loss_arr[i - 1] = 0
    return loss_arr
def RSI(gain_arr, loss_arr, n):
    EMA_u = EMA(gain_arr, n)
    EMA_d = EMA(loss_arr, n)
    EMA_diff = EMA_u / EMA_d
    x, y = EMA_diff.shape
    print(x, y)
    RSI_n = np.empty((1, y))
    for i in range(y):
        if EMA_diff[0, i] == 'NaN':
            RSI_n[0, i] = 'NaN'
            print(i)
        else:
            RSI_n[0, i] = 100 / (1 + EMA_diff[0, i])
    return RSI_n
from contextlib import contextmanager

@contextmanager
def show_complete_array():
    oldoptions = np.get_printoptions()
    np.set_printoptions(threshold=np.inf)
    try:
        yield
    finally:
        np.set_printoptions(**oldoptions)
np.set_printoptions(linewidth=3000)
pd.set_option('display.max_columns', None)
# Specifying root folder, file folder and file
FILE = 'TVC_SILVER, 5.csv'
FOLDER = 'src'
PROJECT_ROOT_DIR = '.'
csv_path = os.path.join(PROJECT_ROOT_DIR, FOLDER, FILE)
# reading csv
price_data = pd.read_csv(csv_path, delimiter=',')
price_data_copy = price_data.copy()
price_data_nodate = price_data.copy().drop('time', axis=1)
price_data_np = price_data_nodate.to_numpy(dtype='float32')
close_price = price_data_np[:, 3]
EMA15 = EMA(close_price_arr=close_price, n=15)
EMA55 = EMA(close_price_arr=close_price, n=55)
gain = gains(close_price_arr=close_price)
loss = losses(close_price_arr=close_price)
RSI14 = RSI(gain_arr=gain, loss_arr=loss, n=14)
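As an aside, the show_complete_array context manager defined above would be used like this (a usage sketch, not part of the original post):

with show_complete_array():
    print(RSI14)   # temporarily lifts numpy's print truncation so the full array is shown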
Try this:
"""dataset is a dataframe"""
def RSI(dataset, n=14):
delta = dataset.diff()
dUp, dDown = delta.copy(), delta.copy()
dUp[dUp < 0] = 0
dDown[dDown > 0] = 0
RolUp = pd.Series(dUp).rolling(window=n).mean()
RolDown = pd.Series(dDown).rolling(window=n).mean().abs()
RS = RolUp / RolDown
rsi= 100.0 - (100.0 / (1.0 + RS))
return rsi
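For example, applied to the data loaded earlier (a sketch; assuming the TradingView csv exposes a 'close' column, which corresponds to price_data_np[:, 3] above):

price_data['RSI14'] = RSI(price_data['close'], n=14)   # 14-period RSI as a new column
print(price_data[['close', 'RSI14']].tail())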
upper_bound = 0x1200
lower_bound = 0x0
msg_to_send = rcvD.all_strck.MxFEAxiRegMsg.copy()
modem_snr_list = []
while True:
    modem_snr_list.clear()
    running_value = (upper_bound + lower_bound) // 2
    msg_to_send["data"] = running_value
    # rcvD.send_the_message("MxFEAxiRegMsg", rcvD.all_strck.MxFEAxiRegMsg)
    rcvD.send_the_message("MxFEAxiRegMsg", msg_to_send)
    time.sleep(2)
    fm.rcv_the_packets(wait_for_optcode=38)
    fm.rcv_the_packets(wait_for_optcode=90)
    modem_sync = rcvD.all_strck.modemParamMsg["modemSync"]
    modem_freq = rcvD.all_strck.modemParamMsg["modemEstFreq"]
    modem_snr = rcvD.all_strck.modemParamMsg["modemSnr"]
    modem_snr_list.append(modem_snr)
    average_snr = sum(modem_snr_list) / len(modem_snr_list)
    print(f"modem snr list is {modem_snr_list}\n modem snr average is {average_snr}")
    sent_pack = rcvD.all_strck.dataStatusRepMsg["sentMsgCnt"]
    receive_pack = rcvD.all_strck.dataStatusRepMsg["rcvMsgCnt"]
    print(
        f"msg_sent: {msg_to_send} \n running_value: {running_value}\n upper_bound: {upper_bound}\n lower_bound: {lower_bound}\n modem_snr: {modem_snr}\n modem_freq: {modem_freq}\n modem_sync: {modem_sync}\n "
        f"sent_packets: {sent_pack}\n receive_packets: {receive_pack}")
    if -1 < modem_snr < 1 and modem_sync and modem_freq < 1000 and sent_pack == receive_pack:
        break
    if modem_snr < -1:
        lower_bound = running_value + 1
    if modem_snr > 1:
        upper_bound = running_value - 1
    if upper_bound < lower_bound:
        print("FAIL")
        exit(1)
Hi all, I need to catch the answer from the PCB (opcode 38). Because of timing, I would prefer to send the data (msg_sent), then build a list that appends the received data, calculate the average, and only after all this proceed to the if statements (using average_snr instead of modem_snr). After that, if I need to go back to the while loop to adjust the data, I will need to clear the previous list and repeat the cycle.
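A minimal sketch of that flow (reusing the rcvD/fm calls and bounds from the code above; SAMPLE_COUNT is a name I'm introducing for illustration):

SAMPLE_COUNT = 5  # how many SNR readings to average before deciding (my own choice)

while True:
    running_value = (upper_bound + lower_bound) // 2
    msg_to_send["data"] = running_value
    rcvD.send_the_message("MxFEAxiRegMsg", msg_to_send)

    # Collect several SNR readings for this running_value, then average them.
    modem_snr_list.clear()
    for _ in range(SAMPLE_COUNT):
        time.sleep(2)
        fm.rcv_the_packets(wait_for_optcode=38)
        fm.rcv_the_packets(wait_for_optcode=90)
        modem_snr_list.append(rcvD.all_strck.modemParamMsg["modemSnr"])
    average_snr = sum(modem_snr_list) / len(modem_snr_list)

    modem_sync = rcvD.all_strck.modemParamMsg["modemSync"]
    modem_freq = rcvD.all_strck.modemParamMsg["modemEstFreq"]
    sent_pack = rcvD.all_strck.dataStatusRepMsg["sentMsgCnt"]
    receive_pack = rcvD.all_strck.dataStatusRepMsg["rcvMsgCnt"]

    # Same decision logic as before, but driven by average_snr.
    if -1 < average_snr < 1 and modem_sync and modem_freq < 1000 and sent_pack == receive_pack:
        break
    if average_snr < -1:
        lower_bound = running_value + 1
    if average_snr > 1:
        upper_bound = running_value - 1
    if upper_bound < lower_bound:
        print("FAIL")
        exit(1)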
parts_list = imp_parts_df['Parts'].tolist()
sub_week_list = ['2016-12-11', '2016-12-04', '2016-11-27', '2016-11-20', '2016-11-13']
i = 0
start = DT.datetime.now()
for p in parts_list:
    for thisdate in sub_week_list:
        thisweek_start = pd.to_datetime(thisdate, format='%Y-%m-%d')  # e.g. '2016/12/11'
        thisweek_end = thisweek_start + DT.timedelta(days=7)  # add 7 days to the week date
        val_shipped = len(shipment_df[(shipment_df['loc'] == 'USW1') & (shipment_df['part'] == str(p)) & (shipment_df['shipped_date'] >= thisweek_start) & (shipment_df['shipped_date'] < thisweek_end)])
print((DT.datetime.now() - start).total_seconds())
shipment_df has around 35000 records
parts_list has 436 parts
sub_week_list has 5 dates in it
It took 438.13 seconds overall to run this code.
Is there any faster way to do it?
parts_list = imp_parts_df['Parts'].astype(str).tolist()
i = 0
start = DT.datetime.now()
for p in parts_list:
    q = 'loc == "xxx" & part == @p & "2016-11-20" <= shipped_date < "2016-11-27"'
    val_shipped = len(shipment_df.query(q))
print((DT.datetime.now() - start).total_seconds())
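Another option, not from the original answer, is to drop the per-part loop entirely and count everything in one vectorized pass (a sketch, assuming shipped_date is already a datetime column and that the weeks in sub_week_list start on Sundays):

# Filter once, bucket each shipment into its Sunday-based week, then count per (part, week).
subset = shipment_df[(shipment_df['loc'] == 'USW1') & shipment_df['part'].isin(parts_list)]
week_start = subset['shipped_date'].dt.to_period('W-SAT').dt.start_time
counts = subset.groupby([subset['part'], week_start]).size()

# Look up one part/week combination ('some_part' is a placeholder; 0 if nothing shipped that week).
val_shipped = counts.get(('some_part', pd.Timestamp('2016-12-11')), 0)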
Currently I am using a barometric sensor with a Raspberry Pi. I am using a time delay and my code looks something like this:
import smbus
import sys
import time

while True:
    try:
        # Get I2C bus
        bus = smbus.SMBus(1)
        # BMP280 address, 0x76(118)
        # Read data back from 0x88(136), 24 bytes
        b1 = bus.read_i2c_block_data(0x76, 0x88, 24)
        # Convert the data
        # Temp coefficients
        dig_T1 = b1[1] * 256 + b1[0]
        dig_T2 = b1[3] * 256 + b1[2]
        if dig_T2 > 32767:
            dig_T2 -= 65536
        dig_T3 = b1[5] * 256 + b1[4]
        if dig_T3 > 32767:
            dig_T3 -= 65536
        # Pressure coefficients
        dig_P1 = b1[7] * 256 + b1[6]
        dig_P2 = b1[9] * 256 + b1[8]
        if dig_P2 > 32767:
            dig_P2 -= 65536
        dig_P3 = b1[11] * 256 + b1[10]
        if dig_P3 > 32767:
            dig_P3 -= 65536
        dig_P4 = b1[13] * 256 + b1[12]
        if dig_P4 > 32767:
            dig_P4 -= 65536
        dig_P5 = b1[15] * 256 + b1[14]
        if dig_P5 > 32767:
            dig_P5 -= 65536
        dig_P6 = b1[17] * 256 + b1[16]
        if dig_P6 > 32767:
            dig_P6 -= 65536
        dig_P7 = b1[19] * 256 + b1[18]
        if dig_P7 > 32767:
            dig_P7 -= 65536
        dig_P8 = b1[21] * 256 + b1[20]
        if dig_P8 > 32767:
            dig_P8 -= 65536
        dig_P9 = b1[23] * 256 + b1[22]
        if dig_P9 > 32767:
            dig_P9 -= 65536
        # BMP280 address, 0x76(118)
        # Select Control measurement register, 0xF4(244)
        # 0x27(39) Pressure and Temperature Oversampling rate = 1
        # Normal mode
        bus.write_byte_data(0x76, 0xF4, 0x27)
        # BMP280 address, 0x76(118)
        # Select Configuration register, 0xF5(245)
        # 0xA0(160) Stand_by time = 1000 ms
        bus.write_byte_data(0x76, 0xF5, 0xA0)
        time.sleep(0.5)
        # BMP280 address, 0x76(118)
        # Read data back from 0xF7(247), 8 bytes
        # Pressure MSB, Pressure LSB, Pressure xLSB, Temperature MSB, Temperature LSB
        # Temperature xLSB, Humidity MSB, Humidity LSB
        data = bus.read_i2c_block_data(0x76, 0xF7, 8)
        # Convert pressure and temperature data to 19-bits
        adc_p = ((data[0] * 65536) + (data[1] * 256) + (data[2] & 0xF0)) / 16
        adc_t = ((data[3] * 65536) + (data[4] * 256) + (data[5] & 0xF0)) / 16
        # Temperature offset calculations
        var1 = ((adc_t) / 16384.0 - (dig_T1) / 1024.0) * (dig_T2)
        var2 = (((adc_t) / 131072.0 - (dig_T1) / 8192.0) * ((adc_t) / 131072.0 - (dig_T1) / 8192.0)) * (dig_T3)
        t_fine = (var1 + var2)
        cTemp = (var1 + var2) / 5120.0
        fTemp = cTemp * 1.8 + 32
        # Pressure offset calculations
        var1 = (t_fine / 2.0) - 64000.0
        var2 = var1 * var1 * (dig_P6) / 32768.0
        var2 = var2 + var1 * (dig_P5) * 2.0
        var2 = (var2 / 4.0) + ((dig_P4) * 65536.0)
        var1 = ((dig_P3) * var1 * var1 / 524288.0 + (dig_P2) * var1) / 524288.0
        var1 = (1.0 + var1 / 32768.0) * (dig_P1)
        p = 1048576.0 - adc_p
        p = (p - (var2 / 4096.0)) * 6250.0 / var1
        var1 = (dig_P9) * p * p / 2147483648.0
        var2 = p * (dig_P8) / 32768.0
        pressure = (p + (var1 + var2 + (dig_P7)) / 16.0) / 100
        # Output data to screen
        print "Temperature in Celsius : %.2f C" % cTemp
        print "Temperature in Fahrenheit : %.2f F" % fTemp
        print "Pressure : %.2f hPa " % pressure
        # add a short sleep here at the end...
        time.sleep(1)
    except KeyboardInterrupt:
        # quit
        sys.exit()
The above code gives me the temperature in Celsius, the temperature in Fahrenheit and the pressure, with a delay of one second, in the following format:
Temp in C
Temp in F
Pressure
Temp in C
Temp in F
Pressure
Temp in C
Temp in F
Pressure
I want to put a gap (and I want to control the size of this gap) between each set of outputs when I use a loop and a time delay in this case (something like a line break). My output should look something like this:

Temp in C
Temp in F
Pressure

Temp in C
Temp in F
Pressure

Temp in C
Temp in F
Pressure
I understand that putting print("") will give a line space, but is there any way to adjust the size of this line space? For example, if the line space is 1 cm, how can we change it to 0.5 cm or 2 cm? Any guidance will be helpful.
Thanks.
You can print an empty line right before or after the sleep(1): print()
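To control the size of the gap: console output is measured in blank lines rather than centimetres (the physical height of a line depends on the terminal font size), so you simply print more or fewer empty lines. A minimal sketch (gap_lines is a name I'm introducing for illustration; sys.stdout.write is used so it behaves the same under Python 2 and 3):

import sys

gap_lines = 2  # number of blank lines between each set of readings; adjust to taste

# ... after printing the three readings inside the loop:
sys.stdout.write("\n" * gap_lines)   # emits exactly gap_lines blank lines
time.sleep(1)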