String to float error in my python project I am stumped - python

# Python code to demonstrate SQL to fetch data.
# importing the module
import sqlite3
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from scipy.stats import chisquare
# connect to the SQLite database file
connection = sqlite3.connect(r"C:\Users\Aidan\Desktop\CEP_DB.db")
# cursor object
crsr = connection.cursor()
# select the columns s, ei, ki for rows whose s equals 'd'
dog= crsr.execute("Select s, ei, ki FROM cep_db_lite1_vc WHERE s IN ('d')")
ans= crsr.fetchall()
# NOTE(review): this is the line the traceback reports. ans is a list
# ("list indices must be integers or slices, not str"), so ans["ki"]
# cannot select a column by name.
filtered_data = ans[~np.isnan(ans["ki"])]
dogData = np.array(filtered_data)
# NOTE(review): the array returned by astype() is not assigned to anything,
# so dogData itself is left unchanged by this statement.
dogData.astype(float)
# keep columns 1 and 2 (ei and ki, following the SELECT order)
FdogData= dogData[:, [1,2]]
# x is the ei column, y is the ki column
x, y = FdogData[:,0], FdogData[:,1]
# Reshape both to column vectors (n, 1) before fitting
x, y = x.reshape(-1,1), y.reshape(-1, 1)
# Linear Regression Object
lin_regression = LinearRegression()
# Fitting linear model to the data
lin_regression.fit(x,y)
# Get slope of fitted line
m = lin_regression.coef_
# Get y-Intercept of the Line
b = lin_regression.intercept_
# Get Predictions for original x values
# you can also get predictions for new data
predictions = lin_regression.predict(x)
# chisquare is called with the predictions first and the measured y second
chi= chisquare(predictions, y)
# following slope intercept form
print ("formula: y = {0}x + {1}".format(m, b))
print(chi)
# Plot the original data points (black) and the predictions (blue)
plt.scatter(x, y, color='black')
plt.plot(x, predictions, color='blue',linewidth=3)
plt.show()
Error: runfile('C:/Users/Aidan/.spyder-py3/temp.py',
wdir='C:/Users/Aidan/.spyder-py3') Traceback (most recent call last):
File "", line 1, in
runfile('C:/Users/Aidan/.spyder-py3/temp.py', wdir='C:/Users/Aidan/.spyder-py3')
File
"C:\Users\Aidan\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py",
line 705, in runfile
execfile(filename, namespace)
File
"C:\Users\Aidan\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py",
line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/Aidan/.spyder-py3/temp.py", line 22, in
filtered_data = ans[~np.isnan(ans["ki"])]
TypeError: list indices must be integers or slices, not str
I believe I am receiving errors in this section:
dog= crsr.execute("Select s, ei, ki FROM cep_db_lite1_vc WHERE s IN ('d')")
ans= crsr.fetchall()
filtered_data = ans[~np.isnan(ans["ki"])]
dogData = np.array(filtered_data)
dogData.astype(float)
FdogData= dogData[:, [1,2]]
x, y = FdogData[:,0], FdogData[:,1]
How can I fix this script? I have been struggling with this error for weeks to no avail!

TypeError: list indices must be integers or slices, not str
That means you can't index a list with a str, only with integers and slices. With that in mind, look at this line:
filtered_data = ans[~np.isnan(ans["ki"])]
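ans is a list (crsr.fetchall() returns a plain list of row tuples), so it can't be indexed by a string. Load the rows into a pandas DataFrame first, e.g. pd.DataFrame(ans, columns=["s", "ei", "ki"]); the "ki" column can then be selected by name.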

Related

How to implement multiprocessing in a for loop inside a function

I've built some code to minimize the sum of the weighted least squares of some residuals. I first read all the data from a .gz file and then process it on the code below (details are irrelevant). I want to use multiprocessing in order to speed up the "runFit" function.
My code is below:
"""
Fit 3D lines to cylinders
"""
from timeit import default_timer as timer
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Circle
from scipy.optimize import minimize
from numba import jit
from multiprocessing import Pool
def readData(filename):
    """Read comma-separated numeric data into a NumPy array.

    Parameters
    ----------
    filename : str or file-like
        Passed straight to ``np.loadtxt``; the script calls this with a
        ``.gz`` file name.

    Returns
    -------
    numpy.ndarray
        The parsed values, as returned by ``np.loadtxt``.
    """
    return np.loadtxt(filename, delimiter=",")
#jit(nopython=True)
def weightedResiduals(unknown, wire_coords, radii, d_radii, d_zcoords):
    """Return the sum of the squared, weighted residuals of a 3D line fit.

    The candidate line passes through ``(0, y_intercept, z_intercept)`` and
    has direction ``(1, xy_slope, xz_slope)``, normalised to unit length.
    For every wire the residual is the perpendicular distance from the wire
    coordinate to that line minus the wire's radius, scaled by
    ``(d_radii**2 + d_zcoords**2) ** -0.5``.

    Parameters
    ----------
    unknown : sequence of 4 numbers
        ``[y_intercept, z_intercept, xy_slope, xz_slope]``.
    wire_coords : array_like, shape (n, 3)
        One coordinate triple per wire.
    radii : array_like, shape (n,)
        Radius associated with each wire.
    d_radii, d_zcoords : array_like, shape (n,)
        Uncertainties combined in quadrature to weight each residual.

    Returns
    -------
    float
        Sum over all wires of ``(weight * distance) ** 2``.
    """
    y_intercept = unknown[0]
    z_intercept = unknown[1]
    xy_slope = unknown[2]
    xz_slope = unknown[3]

    # Force a float dtype: with integer parameters the arrays would be
    # integer-typed and dividing them in place by the norm would fail.
    intercept_vector = np.array([0.0, y_intercept, z_intercept], dtype=float)
    gradient_vector = np.array([1.0, xy_slope, xz_slope], dtype=float)
    gradient_vector = gradient_vector / np.linalg.norm(gradient_vector)

    # Vectorised over wires: |(p - a) x u| is the distance from point p to the
    # line through a with unit direction u.
    offsets = np.asarray(wire_coords, dtype=float) - intercept_vector
    distances = (
        np.linalg.norm(np.cross(offsets, gradient_vector), axis=1)
        - np.asarray(radii, dtype=float)
    )
    variances = (
        np.asarray(d_radii, dtype=float) ** 2
        + np.asarray(d_zcoords, dtype=float) ** 2
    )
    weights = variances ** (-1 / 2)
    return float(np.sum((weights * distances) ** 2))
def runFit(inputfilename, outputfilename):
"""
Parameters
----------
inputfilename : string
input data file name for fitting.
outputfilename : string
result storage file name.
Returns
-------
counter : int
number of successful fits; 100% would be twice the number
of events (two lines per event).
"""
# NOTE(review): counter is never incremented anywhere in this function, so
# the value returned at the end is always 0.
counter = 0
#Reading the required data set
fulldata = readData(inputfilename)
#Defining the output array and filling in the first two columns
# The number of events is taken from column 0 of the last row
# (assumes the last row carries the highest event number -- TODO confirm).
event_no = int(fulldata[-1,0])
# Two result rows per event (one per line), ten columns each.
result = np.zeros((2*event_no, 10))
result[:,0] = np.repeat(np.arange(1, event_no+1), 2)
line_no_array = np.empty((2*event_no,))
line_no_array[::2] = 1
line_no_array[1::2] = 2
result[:,1] = line_no_array
# NOTE(review): singleEventFit is defined inside runFit and reads fulldata
# and result from the enclosing scope. The traceback reports that
# multiprocessing cannot pickle this local function.
def singleEventFit(event):
#Using masking to obtain required rows
mask = (fulldata==event)
# rows whose column 0 equals the event number
desired_rows = mask[:, 0]
#Calculating the fitted line variables using weighted least squares
for line in range(1,3):
#Extracting the desired rows from the full data array
# (column 1 must equal the line number, 1 or 2)
desired_array = fulldata[np.logical_and(desired_rows,(fulldata==line)[:,1])]
#Extracting grouped data from the desired rows
# columns 2-4: wire coordinates, column 5: radii, columns 6-7: uncertainties
wire_coords = desired_array[:,2:5]
wire_x_coords = wire_coords[:,0]
wire_y_coords = wire_coords[:,1]
wire_z_coords = wire_coords[:,2]
radii = desired_array[:,5]
d_radii, d_zcoords = desired_array[:,6], desired_array[:,7]
#Estimating an initial guess for the fitted line variables
# The wires with the smallest and largest |x| serve as the two reference points.
x_min_index = np.argmin(np.abs(wire_x_coords))
x_max_index = np.argmax(np.abs(wire_x_coords))
y_intercept_guess = wire_y_coords[x_min_index]
z_intercept_guess = wire_z_coords[x_min_index]
xy_slope_guess = (wire_y_coords[x_max_index]-wire_y_coords[x_min_index])/(wire_x_coords[x_max_index]-wire_x_coords[x_min_index])
xz_slope_guess = (wire_z_coords[x_max_index]-wire_z_coords[x_min_index])/(wire_x_coords[x_max_index]-wire_x_coords[x_min_index])
init = np.array([y_intercept_guess, z_intercept_guess, xy_slope_guess, xz_slope_guess])
#Minimizing the sum of the weighted residuals
fit_vars = minimize(weightedResiduals, init, args=(wire_coords, radii, d_radii, d_zcoords), tol=1e-5)
if fit_vars.success == True:
y_intercept, z_intercept = fit_vars.x[0], fit_vars.x[1]
xy_slope, xz_slope = fit_vars.x[2], fit_vars.x[3]
#Using the half of the inverse of the Hessian matrix as the covariance matrix to recover errors
std_array = np.sqrt(np.diag(0.5*fit_vars.hess_inv))
#Inputting the variables and their errors on the output array
# Row index 2*event+line-3 maps (event, line) to a 0-based row:
# (1, 1) -> 0, (1, 2) -> 1, (2, 1) -> 2, ...
result[2*event+line-3, 2], result[2*event+line-3, 4] = y_intercept, xy_slope
result[2*event+line-3, 6], result[2*event+line-3, 8] = z_intercept, xz_slope
result[2*event+line-3, 3], result[2*event+line-3, 5] = std_array[0], std_array[2]
result[2*event+line-3, 7], result[2*event+line-3, 9] = std_array[1], std_array[3]
# NOTE(review): the return value of pool.map is discarded and the fit values
# are only written to `result` inside singleEventFit. Confirm that writes
# made in worker processes actually reach this process's `result` array.
with Pool() as pool:
pool.map(singleEventFit, [event for event in range(1, event_no+1)])
#Returning resulting array as a text file
np.savetxt(outputfilename, result, delimiter=',')
return counter
# Keep every top-level side effect behind the __main__ guard: with the
# "spawn" start method (the default on Windows) each worker process imports
# this module again, so unguarded timing/print statements would run once per
# worker.
if __name__=='__main__':
    start = timer()
    print("Successful Plots: " + str(runFit("tendata.txt.gz", "output.txt.gz")))
    end = timer()
    print("Time: " + str(end-start) + "s")
However, I get the following traceback:
Traceback (most recent call last):
File "C:\Users\vanes\Downloads\Python Project\untitled0.py", line 113, in <module>
print("Successful Plots: " + str(runFit("tendata.txt.gz", "output.txt.gz")))
File "C:\Users\vanes\Downloads\Python Project\untitled0.py", line 105, in runFit
pool.map(singleEventFit, [event for event in range(1, event_no+1)])
File "C:\Users\vanes\anaconda3\lib\multiprocessing\pool.py", line 364, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "C:\Users\vanes\anaconda3\lib\multiprocessing\pool.py", line 771, in get
raise self._value
File "C:\Users\vanes\anaconda3\lib\multiprocessing\pool.py", line 537, in _handle_tasks
put(task)
File "C:\Users\vanes\anaconda3\lib\multiprocessing\connection.py", line 211, in send
self._send_bytes(_ForkingPickler.dumps(obj))
File "C:\Users\vanes\anaconda3\lib\multiprocessing\reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
AttributeError: Can't pickle local object 'runFit.<locals>.singleEventFit'
Is there any way that I can use multiprocessing in order to speed up the for-loop?
After reviewing the internet, the recommendation was to move the inner function outside and make it global. However, this can't work since I need variables defined inside "runFit()" in order to execute the loop.

VALUE ERROR: x, y, and format string must not be None. (error while plotting hysteresis loop)

With the low cycle fatigue data, I'm trying to plot the Hysteresis loop. But I'm getting the following error:
[ -52.18297297 -45.58565338 16.9913185 ... -354.53630032 -295.50857248
-155.42088911]
[-0.01229182 -0.00891753 0.02256744 ... -0.33507242 -0.31283728
-0.24790212]
Traceback (most recent call last):
File "f:\I2M\LCF\Ep1_camp4_P4_TTH650 06-9-21 11 01 24\ep1_camp4_P4.py", line 16, in <module>
plt.plot(strain, Sigma, color = 'k')
File "C:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-
packages\matplotlib\pyplot.py", line 2840, in plot
return gca().plot(
File "C:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-
packages\matplotlib\axes\_axes.py", line 1743, in plot
lines = [*self._get_lines(*args, data=data, **kwargs)]
File "C:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-
packages\matplotlib\axes\_base.py", line 273, in __call__
yield from self._plot_args(this, kwargs)
File "C:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-
packages\matplotlib\axes\_base.py", line 379, in _plot_args
raise ValueError("x, y, and format string must not be None")
ValueError: x, y, and format string must not be None
And here is my code:
import matplotlib.pyplot as plt
import numpy as np
plt.style.use(['science','no-latex'])
# Column 2 of the CSV is read into x and column 3 into y; the first two rows are skipped.
x = np.loadtxt('F:\\I2M\\LCF\\Ep1_camp4_P4_TTH650 06-9-21 11 01 24\\data_1.csv',unpack = True,
skiprows = 2, usecols = 2, delimiter = ',')
y = np.loadtxt('F:\\I2M\\LCF\\Ep1_camp4_P4_TTH650 06-9-21 11 01 24\\data_1.csv',unpack = True,
skiprows = 2, usecols = 3, delimiter = ',')
stress = (x*1000)/28.27 #N/mm^2 = MPa
# NOTE(review): the second assignment overwrites the first; only len(y) is kept.
length = len(stress)
length = len(y)
plt.figure(figsize=(5, 5))
# NOTE(review): print() returns None, so Sigma and strain are both None here.
# That matches the "x, y, and format string must not be None" error raised
# by the plt.plot call below.
Sigma = print(stress[0:length:10]) #stress
strain = print(y[0:length:10])
plt.plot(strain, Sigma, color = 'k')
plt.show()
Data contains many rows. So I used some commands to access only particular values from the row
Your problem is here
Sigma = print(stress[0:length:10]) #stress
strain = print(y[0:length:10])
What you presumably want is to sample every 10th data point, but what you get is … nothing or, from Python's point of view, None, so later the stack trace informs you that x, y, and format string must not be None.
Why does this happen, and how do you solve the problem?
When you make an assignment, the value of the expression on the right is saved and you can use the name on the left to use it later, so you save, e.g., the value returned by print(y[0:length:10]) to use it later under the name strain, but print() is used for its side effects (i.e., showing a bunch of characters on your terminal) and the value that is returned in these cases is by default None, not what was shown on your terminal.
If I have understood your intentions, you should omit the two lines above and just use
# Plot every 10th point: strain (y) on the horizontal axis and the computed
# stress on the vertical axis, as the original plt.plot(strain, Sigma) intended.
plt.plot(y[0:length:10], stress[0:length:10], color='k')
A side note, you have
length = len(stress)
length = len(y)
but you read them from the same file, one assignment should be enough…
PS
x, y = np.loadtxt('…\\data_1.csv', unpack=1, skiprows=2, usecols=[2,3], delimiter=',')

Numexpr in Python doesn't recognise a declared symbol

I'm trying to do some plots of some symbolic data. I have some expression from a regression in the form:
expr = '(((((((((1.0)*(2.0)))-(ER)))-(-0.37419122066665467))*0.006633039574629684)*(0.006633039574629684*((((T)-(((1.0)+(P)))))-(P))))+0.1451920626347467)'
Where expr here is some prediction: f = f(T, P, ER). I know this particular example is a crazy expression but it's not really super important. Basically, supposing I have some dataframe, plotdata, I am trying to produce plots with:
import pandas
import sympy
import numexpr
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Regression result as a string expression in the variables T, P and ER.
expr = '(((((((((1.0)*(2.0)))-(ER)))-(-0.37419122066665467))*0.006633039574629684)*(0.006633039574629684*((((T)-(((1.0)+(P)))))-(P))))+0.1451920626347467)'
#Extract some data for surface plot but fixing one variable
# plotdata is assumed to be an existing DataFrame; it is not defined in this snippet.
plotdata = plotdata.loc[(plotdata.P == 1)]
#Extract data as lists for plotting
x = list(plotdata['T'])
y = list(plotdata['ER'])
f_real = list(plotdata['f'])
T_sympy = sympy.Symbol('T')
P_sympy = sympy.Symbol('P')
ER_sympy = sympy.Symbol('ER')
# NOTE(review): no variables named T, P or ER are assigned in this snippet
# (only x, y and the *_sympy symbols), which matches the KeyError: 'ER'
# raised by numexpr.evaluate in the traceback.
f_pred = numexpr.evaluate(expr)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_trisurf(x,y,f_real, alpha = 0.3)
ax.plot_surface(x,y,f_pred)
However, I am getting an error with f_pred.
numexpr.evaluate(expr)
Traceback (most recent call last):
File "/anaconda3/lib/python3.7/site-packages/numexpr/necompiler.py", line 744, in getArguments
a = local_dict[name]
KeyError: 'ER'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<ipython-input-100-c765b0f1e5ce>", line 1, in <module>
numexpr.evaluate(expr)
File "/anaconda3/lib/python3.7/site-packages/numexpr/necompiler.py", line 818, in evaluate
arguments = getArguments(names, local_dict, global_dict)
File "/anaconda3/lib/python3.7/site-packages/numexpr/necompiler.py", line 746, in getArguments
a = global_dict[name]
KeyError: 'ER'
I am not super familiar with the numexpr package. However, I have been building this up from a 1D regression to now a 3D regression. ER was my 1D variable and was working fine. I have obviously slightly altered my code since the 1D case but I am still slightly at a loss as to why this error is popping up.
Any pointers would be greatly appreciated.
I've figured it out. Pretty silly error in the end. I needed to change:
#Extract data as lists for plotting
x = list(plotdata['T'])
y = list(plotdata['ER'])
to:
T = list(plotdata['T'])
ER = list(plotdata['ER'])
P = list(plotdata['P'])
i.e. numexpr.evaluate was looking for the input data, not the symbol!

ValueError when using lmfit LognormalModel

I have been using lmfit for about a day now and needless to say I know very little about the library. I have been using several built-in models for curve fitting and all of them work flawlessly with the data except the Lognormal Model.
Here is my code:
from numpy import *
from lmfit.models import LognormalModel
import pandas as pd
import scipy.integrate as integrate
import matplotlib.pyplot as plt
# Read the CSV and take its first two columns as x and y.
data = pd.read_csv('./data.csv', delimiter = ",")
# NOTE(review): x and y are taken directly from the DataFrame here. The
# traceback shows lmfit's lognormal lineshape assigning into x through
# pandas Series.__setitem__, which is where the ValueError is raised.
x = data.ix[:, 0]
y = data.ix[:, 1]
print (x)
print (y)
mod = LognormalModel()
# Initial parameter values are guessed from the data, then used for the fit.
pars = mod.guess(y, x=x)
out = mod.fit(y, pars , x=x)
print(out.best_values)
print(out.fit_report(min_correl=0.25))
out.plot()
# Data points (blue dots), initial fit (black dashed) and best fit (red line).
plt.plot(x, y, 'bo')
plt.plot(x, out.init_fit, 'k--')
plt.plot(x, out.best_fit, 'r-')
plt.show()
and the error output is:
Traceback (most recent call last):
File "Cs_curve_fit.py", line 17, in <module>
pvout = pvmod.fit(y, amplitude= 1, center = 1, sigma =1 , x=x)
File "C:\Users\NAME\Anaconda3\lib\site-packages\lmfit\model.py", line 731, in fit
output.fit(data=data, weights=weights)
File "C:\Users\NAME\Anaconda3\lib\site-packages\lmfit\model.py", line 944, in fit
self.init_fit = self.model.eval(params=self.params, **self.userkws)
File "C:\Users\NAME\Anaconda3\lib\site-packages\lmfit\model.py", line 569, in eval
return self.func(**self.make_funcargs(params, kwargs))
File "C:\Users\NAME\Anaconda3\lib\site-packages\lmfit\lineshapes.py", line 162, in lognormal
x[where(x <= 1.e-19)] = 1.e-19
File "C:\Users\NAME\Anaconda3\lib\site-packages\pandas\core\series.py", line 773, in __setitem__
setitem(key, value)
File "C:\Users\NAME\Anaconda3\lib\site-packages\pandas\core\series.py", line 755, in setitem
raise ValueError("Can only tuple-index with a MultiIndex")
ValueError: Can only tuple-index with a MultiIndex
First, the error message you show cannot have come from the code you post. The error message says that line 17 of the file "Cs_curve_fit.py" reads
pvout = pvmod.fit(y, amplitude= 1, center = 1, sigma =1 , x=x)
but that is not anywhere in your code. Please post the actual code and the actual output.
Second, the problem appears to be happening because the data for x cannot be turned into a 1D numpy array. Since I cannot trust your code or output, I would just suggest converting the data to 1D numpy arrays yourself as a first test. Lmfit should be able to handle Pandas series, but it just does a simple coercion to 1D numpy arrays.

Debug TypeError: unhashable type: 'numpy.ndarray'

I am working on k-means clustering. I have written some code with the help of references available on the web, but when I run it, it raises an error:
Traceback (most recent call last):
File "clustering.py", line 16, in <module>
ds = df[np.where(labels==i)]
File "/usr/lib/python2.7/dist-packages/pandas/core/frame.py", line 1678, in __getitem__
return self._getitem_column(key)
File "/usr/lib/python2.7/dist-packages/pandas/core/frame.py", line 1685, in _getitem_column
return self._get_item_cache(key)
File "/usr/lib/python2.7/dist-packages/pandas/core/generic.py", line 1050, in _get_item_cache
res = cache.get(item)
TypeError: unhashable type: 'numpy.ndarray'
Many previous threads report the same error, but none of their solutions fixes it in my program. How can I debug this error?
Code which i used:
from sklearn import cluster
import pandas as pd
# Four sample observations with five features each.
df = [
[0.57,-0.845,-0.8277,-0.1585,-1.616],
[0.47,-0.14,-0.5277,-0.158,-1.716],
[0.17,-0.845,-0.5277,-0.158,-1.616],
[0.27,-0.14,-0.8277,-0.158,-1.716]]
df = pd.DataFrame(df,columns= ["a","b","c","d", "e"])
# df = pd.read_csv("cleaned_remove_cor.csv")
k = 3
kmeans = cluster.KMeans(n_clusters=k)
kmeans.fit(df)
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
from matplotlib import pyplot
import numpy as np
for i in range(k):
# select only data observations with cluster label == i
# NOTE(review): df is a pandas DataFrame at this point. The traceback shows
# df[np.where(...)] going through DataFrame.__getitem__ as a column lookup,
# which raises "unhashable type: 'numpy.ndarray'".
ds = df[np.where(labels==i)]
# plot the data observations
pyplot.plot(ds[:,0],ds[:,1],'o')
# plot the centroids
lines = pyplot.plot(centroids[i,0],centroids[i,1],'kx')
# make the centroid x's bigger
pyplot.setp(lines,ms=15.0)
pyplot.setp(lines,mew=2.0)
pyplot.show()
The shape of my DataFrame is (8127x600)
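I tried this and it works for me: convert the pandas DataFrame to a NumPy matrix before indexing (on pandas 1.0 and later, where as_matrix has been removed, use df[["a","b","c","d","e"]].to_numpy() instead):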
df = df.as_matrix(columns= ["a","b","c","d", "e"])

Categories