could not convert string to float: 'Runny_nose' - python

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
Disease_data = pd.read_csv("Disease_dataset.csv")
X = Disease_data.drop(columns='Diseases')
y = Disease_data['Diseases']
model = DecisionTreeClassifier()
model.fit(X, y)
I get this error:
ValueError: could not convert string to float: 'Runny_nose'
I tried
Disease_data = Disease_data['Diseases'].astype(float)
and
music_data = pd.to_numeric(music_data, errors='coerce')
instead I get empty columns

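Some of your rows might not contain valid float data: at least one feature column holds strings such as 'Runny_nose', which the model cannot convert to numbers.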
Visit this thread for more info.

Related

ValueError: could not convert string to float: 'ID1'

import pandas as pd
from sklearn.linear_model import LinearRegression
data = {
'ID': ['ID1', 'ID2', 'ID3', 'ID4', 'ID5'],
'RMSE': [10.05616902165789, 9.496130901397015, 9.857060740380899,9.528204292426823,9.491117416326155]
}
df = pd.DataFrame(data)
X = df[['ID']]
y = df['RMSE']
reg = LinearRegression().fit(X, y)
preds = reg.predict(X)
mean_pred = preds.mean()
print('Mean of predicted RMSE values:', mean_pred)
how to resolve this error.
You are getting this error because your ID column contains only str objects (e.g. 'ID1'), which cannot be converted to float. The columns in X must be numeric for the model to be fitted.

Error at running my script with demand forecast

I'm trying to write a demand-forecasting script, but the following code raises the error shown below. Do you know how to solve it, please?
My code:
import pandas as pd
from sklearn.linear_model import LinearRegression
df = pd.read_excel("Dados.xlsx")
df['Data'] = pd.to_datetime(df['Data'], errors='coerce')
df['Data'] = df['Data'].dt.strftime('%m/%d')
dataset = pd.DataFrame({'Data': ['2022-12-06', '2022-12-07'],'Demanda': [870, 868]})
data = dataset.groupby(dataset['Data'].dt.strftime('%Y-%V'))["Demanda"].sum().reset_index()
NUM_PRED_DAYS = 5
ds = data.Date.values
ds_pred = pd.date_range(start=dataset["Data"].min(), periods=len(ds) + NUM_PRED_DAYS, freq="W")
dataset["Date"] = pd.to_datetime(dataset["Date"])
X = df[['Data']]
y = df['Demanda']
model = LinearRegression()
model.fit(X, y)
futura_datas = pd.DataFrame({'Data': pd.date_range(start='hoje', periods=5)})
futura_demanda = model.predict(futura_datas)
futura_datas['Demanda prevista'] = futura_demanda
print(futura_datas)
And the error is:
"Python311\Lib\site-packages\pandas\core\indexes\accessors.py", line 512, in __new__
raise AttributeError("Can only use .dt accessor with datetimelike values")
AttributeError: Can only use .dt accessor with datetimelike values. Did you mean: 'at'?"
I tried some code that I found here, but none of it solved the problem.
And my Excel file looks like this: enter image description here

cannot concatenate object of type "<class 'numpy.ndarray'>"; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are valid

My input data is under the form:
gold,Program,MethodType,CallersT,CallersN,CallersU,CallersCallersT,CallersCallersN,CallersCallersU,CalleesT,CalleesN,CalleesU,CalleesCalleesT,CalleesCalleesN,CalleesCalleesU,CompleteCallersCallees,classGold
T,chess,Inner,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,-1,Low,1,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,NoTrace,
T,chess,Inner,Low,-1,-1,Low,-1,-1,Medium,-1,Medium,High,-1,High,0,Trace,
T,chess,Inner,Low,-1,-1,Low,-1,-1,Medium,-1,Medium,High,-1,High,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,Low,Low,High,Medium,-1,Medium,0,Trace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,Low,Low,Medium,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,-1,Medium,Medium,0,NoTrace,
T,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,Low,Low,Medium,0,Trace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,-1,Medium,Medium,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,Low,Low,High,Low,Low,Medium,0,Trace,
N,chess,Inner,Low,-1,-1,-1,-1,-1,Low,Low,High,Low,Low,Medium,0,Trace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,-1,Medium,Medium,0,NoTrace,
....
N,chess,Inner,-1,Low,-1,-1,Medium,-1,-1,Low,Low,-1,-1,-1,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Medium,-1,-1,Low,Low,-1,-1,-1,0,NoTrace,
T,chess,Inner,Low,-1,-1,Low,Low,-1,Low,-1,Low,-1,-1,-1,0,Trace,
T,chess,Inner,Low,-1,-1,Medium,-1,-1,Low,-1,Low,-1,-1,-1,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Medium,-1,-1,Low,Low,-1,-1,-1,0,NoTrace,
I am reading my data and I am trying to concatenate two data sets that are subsets of the original data set, here is the code I am using:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
# Feature Scaling
from sklearn.preprocessing import StandardScaler
SeparateProjectLearning=False
CompleteCallersCallees=False
PartialTrainingSetCompleteCallersCallees=True
def main():
X_train={}
X_test={}
y_train={}
y_test={}
dataset = pd.read_csv( 'InputData.txt', sep= ',', index_col=False)
#convert T into 1 and N into 0
dataset['gold'] = dataset['gold'].astype('category').cat.codes
dataset['Program'] = dataset['Program'].astype('category').cat.codes
dataset['classGold'] = dataset['classGold'].astype('category').cat.codes
dataset['MethodType'] = dataset['MethodType'].astype('category').cat.codes
dataset['CallersT'] = dataset['CallersT'].astype('category').cat.codes
dataset['CallersN'] = dataset['CallersN'].astype('category').cat.codes
dataset['CallersU'] = dataset['CallersU'].astype('category').cat.codes
dataset['CallersCallersT'] = dataset['CallersCallersT'].astype('category').cat.codes
dataset['CallersCallersN'] = dataset['CallersCallersN'].astype('category').cat.codes
dataset['CallersCallersU'] = dataset['CallersCallersU'].astype('category').cat.codes
dataset['CalleesT'] = dataset['CalleesT'].astype('category').cat.codes
dataset['CalleesN'] = dataset['CalleesN'].astype('category').cat.codes
dataset['CalleesU'] = dataset['CalleesU'].astype('category').cat.codes
dataset['CalleesCalleesT'] = dataset['CalleesCalleesT'].astype('category').cat.codes
dataset['CalleesCalleesN'] = dataset['CalleesCalleesN'].astype('category').cat.codes
dataset['CalleesCalleesU'] = dataset['CalleesCalleesU'].astype('category').cat.codes
pd.set_option('display.max_columns', None)
row_count, column_count = dataset.shape
Xcol = dataset.iloc[:, 1:column_count]
CompleteSet=dataset.loc[dataset['CompleteCallersCallees'] == 1]
CompleteSet_X = CompleteSet.iloc[:, 1:column_count].values
CompleteSet_Y = CompleteSet.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(CompleteSet_X, CompleteSet_Y, test_size = 0.2, random_state = 0)
TestSet=dataset.loc[dataset['CompleteCallersCallees'] == 0]
X_test1=TestSet.iloc[:, 1:column_count].values
X_test=pd.concat(X_test1,X_test)
I want to build my own test set and training set by using concatenation and I am trying to concatenate X_test1 and X_test in the code above. However, the problem is that I am getting an error for the last line of code X_test=pd.concat(X_test1,X_test) and the error says TypeError: cannot concatenate object of type "<class 'numpy.ndarray'>"; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are valid. How can I fix this?
The problem comes from adding .values to the end of your filters in the following lines:
CompleteSet_X = CompleteSet.iloc[:, 1:column_count].values
CompleteSet_Y = CompleteSet.iloc[:, 0].values
X_test1=TestSet.iloc[:, 1:column_count].values
With .values you are extracting the underlying NumPy ndarray from the pandas Series/DataFrame that the indexing returns. Just remove .values at the end and you can use concat directly with the Series or DataFrame. Note that pd.concat expects a list of objects, e.g. pd.concat([X_test1, X_test]).

How can use scipy with a datetime without the right formatting?

I am trying to visualise a dataset and its average with scipy.interpolate and matplotlib.
But when I try to run the code, which should work perfectly fine, it gives me this error:
File "mittel.py", line 19, in <module>
p1 = polyfit(x, y, 1)
File "C:\Users\simon\AppData\Local\Programs\Python\Python37-32\lib\site-packages\numpy\lib\polynomial.py", line 589, in polyfit
x = NX.asarray(x) + 0.0
TypeError: can only concatenate str (not "float") to str
And the code is:
import time as ti
import pandas as pd
from numpy import *
from matplotlib import pyplot as plt
import csv
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from scipy.interpolate import *
data = pd.read_csv("includes\\csv.csv")
x = array(data["day"])
y = array(data["balance"])
p1 = polyfit(x, y, 1)
print(p1)
plt.plot(x, y, "o")
plt.plot(x, polyval(p1, x), "-r")
plt.show()
I have already tried to convert the x array to a string with
x = str(x)
but that didn't help at all.
My csv file looks like this:
balance,day
242537,28-5
246362,29-5
246659,30-5
246844,31-5
Do you know why that error occurs?
x = NX.asarray(x) + 0.0
TypeError: can only concatenate str (not "float") to str
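As you can see here, + is interpreted as string concatenation because the day column is read as strings (e.g. '28-5'), whereas the fit needs to add floats. So instead of converting x to a string object, try converting x to a float object (values such as '28-5' first have to be mapped to numbers, e.g. a day index):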
x = float(array(data["day"]))
y = float(array(data["balance"]))

How can I get class names back when using MultiLabelBinarizer

I have a csv file which looks like this:
target,data
AAA,some text document
AAA;BBB,more text
AAC,more text
Here is the code:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import BernoulliNB
import pandas as pd
pdf = pd.read_csv("Train.csv", sep=',')
pdfT = pd.read_csv("Test.csv", sep=',')
X1 = pdf['data']
Y1 = [[t for t in tar.split(';')] for tar in pdf['target']]
X2 = pdfT['data']
Y2 = [[t for t in tar.split(';')] for tar in pdfT['target']]
# Vectorizer data
hv = HashingVectorizer(stop_words='english', non_negative=True)
X1 = hv.transform(X1)
X2 = hv.transform(X2)
mlb = MultiLabelBinarizer()
mlb.fit(Y1+Y2)
Y1 = mlb.transform(Y1)
# mlb.classes_ looks like ['AAA','AAC','BBB',...] len(mlb.classes_)==1363
# Y1 looks like [[0,0,0,....0,0,0], ... ] now
# fit
clsf = OneVsRestClassifier(BernoulliNB(alpha=.001))
clsf.fit(X1,Y1)
# predict_proba
proba = clsf.predict_proba(X2)
# want to get class names back
classnames = mlb.inverse_transform(clsf.classes_) # booom, shit happens
for i in range(len(proba)):
# get classnames,probability dict
preDict = dict(zip(classnames, proba[i]))
# sort dict by probability value, print actual and top 5 predict results
print(Y2[i], dict(sorted(preDict.items(),key=lambda d:d[1],reverse=True)[0:5]))
The problem is after clsf.fit(X1,Y1)
clsf.classes_ is an int array [0,1,2,3,...1362]
why is it not like Y1? How can I get the classnames from clsf.classes_? mlb.classes_ == clsf.classes_ or not, with same order?
When you fit OneVsRestClassifier with multiple labels, a LabelBinarizer is called during the fit call, which converts the multilabels into unique labels for each class.
You can access the label_binarizer_ attribute of the clsf object; its classes_ attribute contains the class definitions for the classes fit in the call to clsf.fit.

Categories