How can I create an input for choosing different files to access? - Python

I am quite new to Python, so please bear with me.
Currently, this is my code:
import pandas as pd
import statistics
import matplotlib.pyplot as plt
import math
from datetime import datetime
# Wall-clock start; the elapsed time is printed at the very end of the script.
start_time = datetime.now()
# Both candidate data sets are loaded up front. skiprows=[1] skips the line
# directly after the header row.
gf = pd.read_csv(r"/Users/aaronhuang/Documents/Desktop/ffp/exfileCLEAN2.csv",
                 skiprows=[1])
bf = pd.read_csv(r"/Users/aaronhuang/Documents/Desktop/ffp/2SeconddatasetCLEAN.csv",
                 skiprows=[1])
# NOTE: input() returns the typed text (e.g. the string "gf"), not the
# DataFrame bound to that name, so df is a str from here on.
df = (input("Which data set? "))
# Indexing a str with a column name is what raises
# "TypeError: string indices must be integers".
magnitudes = (df['Magnitude '].values)
times = df['Time '].values
average = statistics.mean(magnitudes)
sd = statistics.stdev(magnitudes)
# Three standard deviations; the detection loop compares against average - below.
below = sd * 3
class data_set:
    """Window of samples centred on one detected dip.

    Reads from the module-level ``df`` frame, which must expose the
    ``'Time '`` and ``'Magnitude '`` columns (trailing space included).

    Attributes:
        i: Row label of the centre sample.
        mid_time: Time value at the centre sample.
        mid_mag: Magnitude value at the centre sample.
        times: Time values of the 80 rows starting 40 rows before the centre.
        mags: Magnitude values for the same 80 rows.
    """

    def __init__(self, index):
        self.mags = []
        self.i = index
        self.mid_time = df['Time '][index]
        self.mid_mag = df['Magnitude '][index]
        self.times = []
        ran = 80
        # Integer half-width keeps the row labels ints rather than floats.
        half = ran // 2
        # NOTE(review): no bounds check - a centre within 40 rows of either
        # end of the frame looks up labels that are outside it.
        for ii in range(ran):
            self.times.append(df['Time '][self.i + ii - half])
            self.mags.append(df['Magnitude '][self.i + ii - half])
# Detected events, one data_set per dip.
data = []
# Maximum time gap for two hits to be treated as the same event.
today = float(input("What is the range? "))
i = 0
while (i < len(df['Magnitude '])):
    if (abs(df['Magnitude '][i]) <= (average - below)):
        # check if neighbours
        t = df['Time '][i]
        tt = True  # stays True only when no stored event lies within range
        for d in range(len(data)):
            if abs(t - data[d].mid_time) <= today:
                # check if closer to center
                if df['Magnitude '][i] < data[d].mid_mag:
                    data[d] = data_set(i)
                    print("here")
                # Either way this sample belongs to an existing event.
                tt = False
                break
        if tt:
            data.append(data_set(i))
    i += 1
print("found values")
# graphing
height = 2  # Change this for number of columns
width = math.ceil(len(data) / height)
# Keep at least two rows so plt.subplots returns a 2-D axes array, which the
# axes[row][col] indexing below relies on.
if width < 2:
    width = 2
fig, axes = plt.subplots(width, height, figsize=(30, 30))
row = 0
col = 0
for i in range(len(data)):
    axes[row][col].plot(data[i].times, data[i].mags)
    # Fill the grid left to right, then move down one row.
    col += 1
    if col > height - 1:
        col = 0
        row += 1
plt.show()
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))
Currently, the error produced is this:
/Users/aaronhuang/.conda/envs/EXTTEst/bin/python "/Users/aaronhuang/PycharmProjects/EXTTEst/Code sandbox.py"
Which data set? gf
Traceback (most recent call last):
File "/Users/aaronhuang/PycharmProjects/EXTTEst/Code sandbox.py", line 14, in <module>
magnitudes = int(df['Magnitude '].values)
TypeError: string indices must be integers
Process finished with exit code 1
I am trying to have the user be able to choose which file to access to perform the rest of the code on.
So if the user types gf I would like the code to access the first data file.
Any help would be appreciated. Thank you

Why not use an if-statement at the beginning? Try this:
instead of:
gf = pd.read_csv(r"/Users/aaronhuang/Documents/Desktop/ffp/exfileCLEAN2.csv",
                 skiprows=[1])
bf = pd.read_csv(r"/Users/aaronhuang/Documents/Desktop/ffp/2SeconddatasetCLEAN.csv",
                 skiprows=[1])
# input() yields a str, so df holds the typed text rather than gf or bf.
df = (input("Which data set? "))
Use this:
# Ask which data set to analyse and load only that file into df.
choice = input("Which data set? ")
if choice == "gf":
    df = pd.read_csv(r"/Users/aaronhuang/Documents/Desktop/ffp/exfileCLEAN2.csv",
                     skiprows=[1])
elif choice == "bf":
    df = pd.read_csv(r"/Users/aaronhuang/Documents/Desktop/ffp/2SeconddatasetCLEAN.csv",
                     skiprows=[1])
else:
    print("Error. Your choice is not valid")
    # `break` is only legal inside a loop, and leaving df as "" would just
    # reproduce the original TypeError further down; stop the script instead.
    raise SystemExit(1)

Related

RuntimeError: input(): lost sys.stdin from executable file

I am seeking your help with an executable file that I converted from my .py file using auto-py-to-exe. The code below is what I wrote for my CSV automation report. It runs fine from the IDE and from CMD, but when I convert it to an .exe, this is what happens:
Traceback (most recent call last):
File "new.py", line 7, in <module>
input_file = input("Enter the file name of your HC file: ")
RuntimeError: input(): lost sys.stdin
Here is my code below for your reference. Hoping you could help me with this issue.
import pandas as pd
import numpy as np
# Builds output.csv from the 'MSAN Cabinets' sheet of a user-named workbook.
print("Fixed Network Health Check Checker")
# The RuntimeError quoted above ("input(): lost sys.stdin") is raised by this call.
input_file = input("Enter the file name of your HC file: ")
file = input_file + str('.xlsx')
df = pd.read_excel(file, sheet_name = 'MSAN Cabinets')
print("Done")
#fixed
# Blank interface cells become 0 so the == 0 / != 0 filters below can find them.
df['MSAN Interface'] = df['MSAN Interface'].replace(np.nan, 0)
df['ACCESS Interface 1 (IPRAN, ATN, LSA, VLAN)'] = df['ACCESS Interface 1 (IPRAN, ATN, LSA, VLAN)'].replace(np.nan, 0)
# NOTE(review): this writes 'Homing AG2' from 'Homing AG1' - confirm that is intended.
df['Homing AG2'] = df['Homing AG1'].replace(np.nan, 0)
# Keep only the first 11255 rows.
df = df.iloc[:11255]
# filter "REGION" and drop unnecessary columns
f_df1 = df[df['REGION'] == 'MIN']
# NOTE(review): a DataFrame slice is passed as the labels to drop; presumably
# this relies on iterating a DataFrame yielding its column names - verify.
dropcols_df1 = f_df1.drop(df.iloc[:, 1:6], axis = 1)
dropcols_df2 = dropcols_df1.drop(df.iloc[:, 22:27], axis = 1)
dropcols_df3 = dropcols_df2.drop(df.iloc[:, 37:50], axis = 1)
# filter "MSAN Interface" and filter the peak util for >= 50%
f_d2 = dropcols_df3['MSAN Interface'] != 0
msan_int = dropcols_df3[f_d2]
f_msan_int = msan_int['Peak Util'] >= 0.5
new_df = msan_int[f_msan_int]
# filter "ACCESS Interface 1 (IPRAN, ATN, LSA, VLAN)" and filter the peak util for >= 50%
fblank_msan_int = dropcols_df3['MSAN Interface'] == 0
# NOTE(review): msan_int1 is not read again in this script; access_int1 below
# is filtered from dropcols_df3, not from msan_int1 - confirm which was meant.
msan_int1 = dropcols_df3[fblank_msan_int]
f_df3 = dropcols_df3['ACCESS Interface 1 (IPRAN, ATN, LSA, VLAN)'] != 0
access_int1 = dropcols_df3[f_df3]
f_access_int1 = access_int1['Peak Util.1'] >= 0.5
new_df1 = access_int1[f_access_int1]
# filter "Homing AG1" and filter the peak util for >= 50%
# Rows where both the MSAN and ACCESS interface cells are 0.
fblank_msan_int1 = dropcols_df3['MSAN Interface'] == 0
msan_int2 = dropcols_df3[fblank_msan_int1]
f_access_int2 = msan_int2['ACCESS Interface 1 (IPRAN, ATN, LSA, VLAN)'] == 0
new_df2 = msan_int2[f_access_int2]
ag1 = new_df2['Peak Util.3'] >= 0.5
new_df3 = new_df2[ag1]
# Concatenate all DataFrames
pdList = [new_df, new_df1, new_df3]
final_df = pd.concat(pdList)
# to_csv returns None when given a path, so this print shows "None".
print(final_df.to_csv('output.csv', index = False))
Thank you. Btw I'm new in Python :).

Python Pandas Dataframe assigning target weights

I have three pandas DataFrames, each with two identically named columns. I would like to compare them and replace the given values with target weights: 1 (buy) and -1 (sell), respectively. If neither condition is met, the previous value should be copied.
Below error code displayed on the first if line: ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
1st try:
# Whole-frame boolean masks: cells where the condition holds are overwritten.
target_weights[(sma_short_int > sma_long_int) & (stock_rsi_int < rsi_buy)] = 1
target_weights[(sma_short_int <= sma_long_int) & (stock_rsi_int < rsi_buy)] = -1
# Move every row up by one position; the last row becomes NaN.
target_weights = target_weights.shift(-1)
Output: all numbers should be replaced with 1 or -1 respectively, and if neither the buying nor the selling condition is met, the previous number should be carried forward. (Image: output of the 1st try)
Second try:
signal = 0
for i in target_weights:
    # Each `if` below tests a whole DataFrame selection rather than a single
    # boolean, which is what raises the "truth value of a DataFrame is
    # ambiguous" ValueError quoted above.
    if target_weights[sma_short > sma_long]:
        if target_weights[stock_rsi_int < rsi_buy_df]:
            signal = 1/ticker_count
            target_weights = 1
        else:
            signal = signal
    elif target_weights[sma_short <= sma_long]:
        if target_weights[stock_rsi_int < rsi_sell_df]:
            target_weights = -1
            signal = -(1/ticker_count)
        else:
            signal = signal
    else:
        signal = signal
Current code:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 19 10:33:02 2022
#author: ericberner
"""
import bt
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import talib
# Comma-separated ticker symbols; strategy() reads this module-level name.
tickers = 'EURUSD=X, USDCAD=X'
# One row is added per parameter combination tried by the grid search below.
result_df = pd.DataFrame(columns = ['SMA_Short','SMA_Long','RSI_Buy','RSI_Sell','CAGR','daily_sharpe'])
# Per-ticker RSI series; filled in by strategy().
stock_rsi = pd.DataFrame()
# Defining strategy
def strategy(data = pd.DataFrame(),
             short_sma = 50,
             long_sma = 200,
             rsi_buy = 20,
             rsi_sell = 60,
             name = 'RSI and SMA',
             start='2020-1-1',
             end ='2021-6-1'):
    """Build a ``bt.Backtest`` for an SMA-crossover plus RSI rule.

    Args:
        data: Ignored; it is overwritten by the ``bt.get`` download below.
        short_sma: Window of the short simple moving average.
        long_sma: Window of the long simple moving average.
        rsi_buy: RSI level compared against in both weight rules.
        rsi_sell: Only used to fill ``rsi_sell_df``, which the weight rules
            do not read.
        name: Name given to the ``bt.Strategy``.
        start: Start date passed to ``bt.get``.
        end: End date passed to ``bt.get``.

    Returns:
        A ``bt.Backtest`` built from the strategy and the downloaded prices.

    Side effects:
        Writes one RSI column per ticker into the module-level ``stock_rsi``
        frame and prints the target weights.
    """
    # Getting tickers
    data = bt.get(tickers, start=start, end=end)
    # Calculating SMAs
    sma_short = data.rolling(short_sma).mean()
    sma_long = data.rolling(long_sma).mean()
    # Calculating RSI
    for i in data:
        stock_rsi[i] = talib.RSI(data[i], timeperiod=14)
    # Counting the number of tickers
    ticker_count = len(data.columns)
    # Initializing signal
    # signal = data.copy()
    # signal['signal'] = 0
    # signal = signal[['signal']]
    # signal
    # NOTE(review): the 170-row offset is hard-coded rather than derived from
    # long_sma - presumably meant to skip warm-up rows; confirm.
    target_weights = sma_long.copy()
    target_weights = target_weights.iloc[170:]
    target_weights = target_weights.astype(float)
    stock_rsi_int = stock_rsi.iloc[170:]
    stock_rsi_int = stock_rsi_int.astype(float)
    rsi_buy_int = float(rsi_buy)
    rsi_sell_int = float(rsi_sell)
    sma_long_int = sma_long.iloc[170:]
    sma_short_int = sma_short.iloc[170:]
    sma_long_int = sma_long_int.astype(float)
    sma_short_int = sma_short_int.astype(float)
    #creating another df for rsi_buy and sell for comparison
    rsi_buy_df = sma_long_int.copy()
    rsi_buy_df[rsi_buy_df > 0] = rsi_buy_int
    #rsi_buy_df = rsi_buy_df.replace(rsi_buy_int, inplace=True)
    rsi_sell_df = sma_long_int.copy()
    rsi_sell_df[rsi_sell_df > 0] = rsi_sell_int
    # Counting the number of tickers
    # Setting target weights
    # Starts from the long-SMA values, so any cell matched by neither mask
    # below keeps its SMA value.
    target_weights = sma_long_int.copy()
    # for i in target_weights:
    # The loop variable is unused: both assignments act on the whole frame,
    # so each pass over the columns repeats the same work.
    for i in target_weights:
        target_weights[(sma_short_int > sma_long_int) & (stock_rsi_int < rsi_buy)] = 1
        target_weights[(sma_short_int <= sma_long_int) & (stock_rsi_int < rsi_buy)] = -1
    target_weights = target_weights.shift(-1)
    print(target_weights)
    # if target_weights[(sma_short > sma_long)] & target_weights[(stock_rsi < rsi_buy)]:
    # target_weights['signal'] = 1/ticker_count
    # elif target_weights[(sma_short <= sma_long)] & target_weights[(stock_rsi > rsi_sell)]:
    # target_weights['signal'] = -(1/ticker_count)
    # else:
    # target_weights['signal'] = target_weights['signal']
    # Creating strategy
    strategy = bt.Strategy(name,
                           [bt.algos.WeighTarget(target_weights),
                            bt.algos.Rebalance()])
    # Outputing the Backtest object
    return bt.Backtest(strategy, data)
# Creating a loop to test optimal parameters
# 5 short SMAs x 2 long SMAs x 3 buy levels x 4 sell levels = 120 backtests.
# Short SMA
for short_number in range(20,25,1):
    short_name = 'sma'+ str(short_number)
    # Long SMA
    for long_number in range(150,160,5):
        long_name = 'sma'+ str(long_number)
        # Buy signal RSI
        for buy_rsi in range(20,35,5):
            buy_name = 'buy' + str(buy_rsi)
            # Sell signal RSI
            for sell_rsi in range(65,85,5):
                sell_name = 'sell' + str(sell_rsi)
                # Using the ma_crossover function
                # The backtest is named short_name, so that is the column to
                # read back from result.stats.
                result = bt.run(strategy(tickers, short_sma = short_number, long_sma = long_number, rsi_buy = buy_rsi, rsi_sell = sell_rsi, name = short_name))
                new_row = {'SMA_Short':short_number,'SMA_Long':long_number,'RSI_Buy':buy_rsi,'RSI_Sell':sell_rsi,'CAGR':result.stats.at['cagr',short_name],
                           'daily_sharpe': result.stats.at['daily_sharpe',short_name]}
                # DataFrame.append was removed in pandas 2.0; pd.concat is the
                # supported way to add the row.
                result_df = pd.concat([result_df, pd.DataFrame([new_row])], ignore_index=True)

VPN Indicator ThinkScript to Python

Taking a stab at converting a ThinkScript to Python for the first time, and I think my logic is right, but I am missing something as the two plots for the indicator don't match.
Trying to convert the ThinkScript for the VPNIndicator to a Python implementation. Looking for someone knowledgeable in both languages to contribute here.
To start, the indicator plot in ThinkorSwim looks like this (bottom):
So I'm trying to replicate that plot using matplotlib finance, but first I need to translate from ThinkScript to Python, which I've attempted here:
import mplfinance as mpf
import pandas as pd
import numpy as np
import talib
def VPN_Indicator(df, params):
    """Add the VPN indicator columns to ``df`` and return it.

    Args:
        df: Frame with ``High``, ``Low``, ``Close`` and ``Volume`` columns.
            It is modified in place.
        params: Mapping with the keys ``length``, ``emaLength``,
            ``averageLength``, ``factor``, ``criticalValue`` and
            ``averageType``.

    Returns:
        The same ``df`` object, with helper columns plus ``VPN`` and
        ``CriticalLevel`` added. ``VPNAvg`` is added only when
        ``averageType`` is one of the "simple" spellings.
    """
    # def atr = WildersAverage(TrueRange(high, close, low), length);
    df['H-L'] = df['High'] - df['Low']
    df['H-C1'] = df['High'] - df['Close'].shift()
    df['C1-L'] = df['Close'].shift() - df['Low']
    df['TrueRange'] = df[['H-L','H-C1','C1-L']].max(axis=1)
    # Exponential smoothing with alpha = 1 / length.
    df['WildersATR'] = df['TrueRange'].ewm(alpha=1.0 / params['length'], adjust=False).mean()
    # def diff = hlc3 - hlc3[1];
    df['Diff'] = ((df['High'] + df['Low'] + df['Close']) / 3) - ((df['High'].shift() + df['Low'].shift() + df['Close'].shift()) / 3) # Forward peak here?
    # def vp = Sum(if diff > factor * atr then volume else 0, length);
    df['VP_Helper'] = np.where(df['Diff'] > params['factor'] * df['WildersATR'], df['Volume'], 0)
    df['VP'] = df['VP_Helper'].rolling(params['length']).sum()
    # def vn = Sum(if diff < -factor * atr then volume else 0, length);
    df['VN_Helper'] = np.where(df['Diff'] < -params['factor'] * df['WildersATR'], df['Volume'], 0)
    df['VN'] = df['VN_Helper'].rolling(params['length']).sum()
    # plot VPN = ExpAverage(100 * (vp - vn) / Sum(volume, length), emaLength);
    df['RollingVol'] = df['Volume'].rolling(params['length']).sum()
    df['VPN'] = talib.EMA(100 * (df['VP'] - df['VN']) / df['RollingVol'], timeperiod=params['emaLength'])
    # plot VPNAvg = MovingAverage(averageType, VPN, averageLength);
    # Any other averageType leaves df without a 'VPNAvg' column.
    if params['averageType'] in ['simple','sma','SMA','SIMPLE']:
        df['VPNAvg'] = talib.SMA(df['VPN'], timeperiod=params['averageLength'])
    # plot CriticalLevel = criticalValue;
    df['CriticalLevel'] = params['criticalValue']
    # VPN.DefineColor("Above", Color.UPTICK);
    # VPN.DefineColor("Below", Color.DOWNTICK);
    # VPN.AssignValueColor(if VPN > CriticalLevel then VPN.Color("Above") else VPN.Color("Below"));
    # VPNAvg.SetDefaultColor(GetColor(7));
    # CriticalLevel.SetDefaultColor(GetColor(1));
    # Gimmicks, don't need the top bit for now
    return df
# Settings read by VPN_Indicator through params[...] lookups.
params = {
    "length": 30,
    "emaLength": 3,
    "averageLength": 30,
    "factor": 0.1,
    "criticalValue": 10,
    "averageType": "simple"
}
# Import a 1min dataset and rename columns as necessary
# Only the last 2000 rows of the file are kept.
df = pd.read_csv("SPY.csv").iloc[-2000:,:]
df['time'] = pd.to_datetime(df['time'])
df = df.set_index('time')
df = df.rename(columns={'open':'Open', 'high':'High', 'low':"Low", "close": "Close", "volume": "Volume"})
df = VPN_Indicator(df, params)
# Plot the results
# All three indicator lines go to panel 2 and are drawn in the same colour.
apds = [ mpf.make_addplot((df['CriticalLevel']), panel=2, color='g'),
         mpf.make_addplot((df['VPN']), panel=2, color='g'),
         mpf.make_addplot((df['VPNAvg']), panel=2, color='g'),
       ]
mpf.plot(df[['Open', 'High', 'Low', 'Close', 'Volume']], addplot=apds, figscale=1.2, volume=True)
... which results in a plot that looks like this:
... which is close, but the peaks don't line up with the ThinkOrSwim plot. So I'm wanting to know from someone who knows these languages where I might be off? Thanks!
Try using this to calculate ATR. This gives the same output as TOS.
import numpy as np
def ema(arr, periods=14, weight=1, init=None):
    """Return the exponential moving average of a 1-D numeric sequence.

    The smoothing factor is ``alpha = weight / (periods + weight - 1)``, so
    ``weight=1`` gives ``1 / periods`` and ``weight=2`` gives
    ``2 / (periods + 1)``.

    Args:
        arr: 1-D sequence of numbers; leading NaNs are skipped and returned
            as NaN.
        periods: Smoothing length; also the number of leading outputs (after
            any leading NaNs) that are blanked to NaN.
        weight: Numerator of the smoothing factor.
        init: Value the average starts from. ``None`` seeds it with the first
            non-NaN sample; ``0.0`` is honoured as a real seed.

    Returns:
        A float ``numpy`` array with the same length as ``arr``.
    """
    values = np.asarray(arr, dtype=float)
    valid = np.flatnonzero(~np.isnan(values))
    if valid.size == 0:
        # Nothing to average (empty or all-NaN input).
        return np.full(values.shape[0], np.nan)
    leading_na = int(valid[0])
    values = values[leading_na:]

    alpha = weight / (periods + (weight - 1))
    decay = 1 - alpha

    # Direct recurrence y[t] = decay * y[t-1] + alpha * x[t]. Unlike the
    # closed form built on powers of `decay`, it cannot underflow on long
    # inputs or when decay is 0, so no chunking is needed.
    out = np.empty(values.shape[0])
    prev = values[0] if init is None else init
    for idx, value in enumerate(values):
        prev = decay * prev + alpha * value
        out[idx] = prev

    # The first `periods` values are warm-up and are reported as NaN.
    out[:periods] = np.nan
    return np.concatenate((np.full(leading_na, np.nan), out))
def atr(highs, lows, closes, periods=14, ema_weight=1):
    """Return the average true range, smoothed with ``ema``.

    Args:
        highs: Sequence of bar highs.
        lows: Sequence of bar lows.
        closes: Sequence of bar closes.
        periods: Smoothing length passed to ``ema``.
        ema_weight: ``weight`` passed to ``ema``.

    Returns:
        An array with one entry per bar. The first entry is NaN because the
        true range needs the previous close.
    """
    high_arr = np.array(highs)
    low_arr = np.array(lows)
    close_arr = np.array(closes)
    prev_close = close_arr[:-1]
    # True range per bar (from the second bar on): the largest of the three
    # candidate ranges.
    true_range = np.vstack([np.abs(high_arr[1:] - prev_close),
                            np.abs(low_arr[1:] - prev_close),
                            (high_arr - low_arr)[1:]]).max(axis=0)
    smoothed = ema(true_range, periods=periods, weight=ema_weight)
    # Pad the front so the result lines up with the input bars.
    return np.concatenate([[np.nan], smoothed])

How to fix for loop in Python Visio Converter

I am trying to run a for loop but have it stop when it gets to a certain date, and it does this, however, it prints out multiple shapes on top of each other, when I only want 1. The program somehow prints the number of shapes based on the row number it is in Excel. Not sure how to fix this, any help would be appreciated.
from PIL import Image, ImageDraw, ImageFont
import win32com.client
from win32com.client import constants as vis
app = win32com.client.gencache.EnsureDispatch( 'Visio.Application' )
# Date to stop at, read from one fixed cell and formatted as MM/DD.
current = datetime.datetime(*xlrd.xldate_as_tuple(sheet3.cell_value(7,9), wb.datemode))
currentDate = current.strftime('%m/%d')
dateList = []
for row in range(1,sheet3.nrows):
    # Rows without a date in column 13 are skipped.
    if sheet3.cell_value(row,13) == "":
        continue
    date = datetime.datetime(*xlrd.xldate_as_tuple(sheet3.cell_value(row,13), wb.datemode))
    dateList.append(date.strftime('%m/%d'))
    # NOTE(review): nesting reconstructed from the flattened paste. The inner
    # loop draws the same coordinates (taken from `row`) once per date
    # collected so far, which matches the stacked shapes described above.
    for date in dateList:
        x1 = sheet3.cell_value(row,14)
        x2 = sheet3.cell_value(row,15)
        y1 = sheet3.cell_value(row,16)
        y2 = sheet3.cell_value(row,17)
        borderColor = 0
        borderType = 0
        colorValue = sheet3.cell_value(9,10)
        colorFunc(x1,y1,x2,y2)
        shape.Cells('FillforegndTrans').FormulaU = sheet3.cell_value(7,10)
        # Leaves only the inner loop; the row loop keeps going.
        if currentDate == date:
            break
I have figured it out. Instead of using a second for loop, I just need an if statement at the end of the row loop: if currentDate is in dateList, then break.
dateList = []
for row in range(1,sheet3.nrows):
    # Rows without a date in column 13 are skipped.
    if sheet3.cell_value(row,13) == "":
        continue
    date = datetime.datetime(*xlrd.xldate_as_tuple(sheet3.cell_value(row,13), wb.datemode))
    dateList.append(date.strftime('%m/%d'))
    # Both values come from a fixed cell, so they are the same on every pass.
    current = datetime.datetime(*xlrd.xldate_as_tuple(sheet3.cell_value(7,9), wb.datemode))
    currentDate = current.strftime('%m/%d')
    x1 = sheet3.cell_value(row,14)
    x2 = sheet3.cell_value(row,15)
    y1 = sheet3.cell_value(row,16)
    y2 = sheet3.cell_value(row,17)
    borderColor = 0
    borderType = 0
    colorValue = sheet3.cell_value(9,10)
    # One shape per dated row.
    colorFunc(x1,y1,x2,y2)
    shape.Cells('FillforegndTrans').FormulaU = sheet3.cell_value(7,10)
    # Stop after drawing the row whose date is the current one.
    if currentDate in dateList:
        break

Is python pandas dataframe too slow?

I have an interesting problem. I have two files, NYPD_Motor_Collisions.csv has 1.2M lines and weatherfinal.txt has 109K lines. The objective is to merge the temp and prec data from weatherfinal.txt to the Collisions files as two columns based on the latitudes and longitudes. I wrote the following code using dataframe in pandas python.
from math import cos, asin, sqrt
import pandas as pd
import numpy as np
import os
import re
import datetime
def distance(lat1, lon1, lat2, lon2):
    """Return the haversine distance between two points given in degrees.

    The result uses a sphere of diameter 12742 (Earth's mean diameter in
    kilometres), so it is in kilometres.
    """
    p = 0.017453292519943295  # pi / 180: degrees to radians
    a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p)*cos(lat2*p) * (1-cos((lon2-lon1)*p)) / 2
    # Rounding can push `a` a hair outside [0, 1], which would make sqrt/asin
    # raise a math domain error for identical or antipodal points.
    a = min(1.0, max(0.0, a))
    return 12742 * asin(sqrt(a))
def closest(data, v):
    """Return the entry of ``data`` nearest to ``v``.

    Both ``v`` and every entry of ``data`` are mappings with ``'lat'`` and
    ``'lon'`` keys; nearness is measured with ``distance``.
    """
    return min(data, key=lambda p: distance(v['lat'],v['lon'],p['lat'],p['lon']))
# One {'lat', 'lon', 'SENSORID'} entry per weather sensor.
tempDataList = []
#v = {'lat': 39.7622290, 'lon': -86.1519750}
#print(closest(tempDataList, v))
print(os.getcwd())
# Collapse runs of spaces so the weather file can be parsed with a single
# space as the separator. `with` closes both files, including the input one.
with open("weatherfinal.txt", 'r') as filed_, open("weatherfinal_updated.txt", "w") as fileo_:
    for line_ in filed_:
        outline = re.sub(" +"," ",line_)
        fileo_.write(outline + "\n")
df = pd.read_csv("NYPD_Motor_Vehicle_Collisions.csv")
colhead = np.append(df.columns.values,['TEMP', 'PREP'])
outdf = pd.DataFrame(columns=colhead)
# sep is passed by keyword; newer pandas no longer accepts it positionally.
df2 = pd.read_csv("weatherfinal_updated.txt", sep=' ')
# NOTE(review): set_index returns a new frame and the result is discarded, so
# df2 keeps its default index. The column lookups below depend on that.
df2.set_index(['WBANNO', 'LST_DATE', 'LST_TIME'])
sensorIds = df2['WBANNO'].unique()
for ids_ in sensorIds:
    # Position of each sensor, taken from its first row.
    longitude = df2.loc[df2['WBANNO']==ids_,'LONGITUDE'].iloc[0]
    latitude = df2.loc[df2['WBANNO'] == ids_, 'LATITUDE'].iloc[0]
    tempDataList.append({'lat':latitude,'lon':longitude,'SENSORID': ids_ })
print(tempDataList)
# For every collision row, find a weather reading in the matching hour and
# copy its temperature and precipitation into the output frame.
for index, row in df.iterrows():
    lon_ = row['LONGITUDE']
    lat_ = row['LATITUDE']
    tdate = row['DATE']
    ttime = row['TIME']
    # Fallback values, written when no sensor has a reading for the hour.
    tcal = 5
    pcal = 0
    fwdate = datetime.datetime.strptime(str(tdate), '%m/%d/%Y').strftime('%Y%m%d')
    fwtime = datetime.datetime.strptime(str(ttime), '%H:%M').strftime('%H%M')
    # Upper bound of the window: HHMM plus 100, i.e. one hour later.
    ntime = float(fwtime) + float(100)
    closests_ = closest(tempDataList, {'lat':lat_,'lon':lon_})
    sensorid = closests_['SENSORID']
    usedSensorId = sensorid
    # Each lookup builds boolean masks over every row of df2.
    selectedWeatherRow = df2.loc[(df2.WBANNO == sensorid) & (df2.LST_DATE == float(fwdate)) & (df2.LST_TIME >= float(fwtime)) & (df2.LST_TIME < ntime) ,['T_CALC', 'P_CALC']]
    if len(selectedWeatherRow.index) == 0:
        # Nearest sensor has no reading: take the first other sensor that
        # does, in sensorIds order.
        for sensId in sensorIds:
            if sensId == sensorid:
                continue
            selectedWeatherRow = df2.loc[(df2.WBANNO == sensId) & (df2.LST_DATE == float(fwdate)) & (df2.LST_TIME >= float(fwtime)) & (df2.LST_TIME < ntime), ['T_CALC', 'P_CALC']]
            if len(selectedWeatherRow.index) == 0:
                continue
            else:
                tcal = selectedWeatherRow['T_CALC'].values[0]
                pcal = selectedWeatherRow['P_CALC'].values[0]
                usedSensorId = sensId
                break
    else:
        tcal = selectedWeatherRow['T_CALC'].values[0]
        pcal = selectedWeatherRow['P_CALC'].values[0]
    row['TEMP'] = tcal
    row['PREP'] = pcal
    # The output frame grows by one row per iteration.
    outdf.loc[index] = row
    print(index, tcal, pcal, fwdate, fwtime, ntime, usedSensorId)
print("Loop completed")
outdf.to_csv("NYPD_TRAFFIC_DATA.csv")
print("file completed")
This program has been running for days. Not sure why dataframe is too slow. I rewrote the program without dataframe using dictionaries and it completed in a few minutes. Not sure if dataframe is slow or I am not using it correctly. Just posting here for learning.

Categories