Issues with "setting an array element with a sequence" - python

I'm having issues with a "setting an array element with a sequence" error.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
df = pd.read_csv('googleplaystore.csv') # 1
df = df.dropna() # 3
df['Size'] = df['Size'].str.extract(r'(\d+\.?\d)', expand=False).astype(float) * df['Size'].str[-1].replace({'M': 1024, 'k': 1}) # 4
df = df.dropna() # remove nan from "Varies with device"
df['Price'] = df['Price'].str.strip('$').astype(float) # 5
df['Installs'] = df['Installs'].str.strip('+')
df['Installs'] = df['Installs'].str.replace(',',"").astype(int)
df['Reviews'] = df['Reviews'].astype(float)
df['Size'] = df['Size'].astype(float)
df = df.loc[df['Rating'].between(1, 5)] # 6
df = df.loc[df['Type'] != 'Free'] # 7
df.drop(df[df['Price'] >= 200].index, inplace = True)
df.drop(df[df['Reviews'] >2000000].index, inplace = True)
df.drop(df[df['Installs'] >10000].index, inplace = True)
inp1 = df.copy()
df_reviewslog=np.log10(df['Reviews'])
df_installslog=np.log10(df['Installs'])
del df['App']
del df['Last Updated']
del df['Current Ver']
del df['Android Ver']
pd.get_dummies(df, columns=['Category', 'Genres', 'Content Rating'], drop_first=True)
inp2 = df.copy()
df_train = X_train,X_test,y_train,y_test=train_test_split(df['Reviews'],df['Installs'], test_size=0.7, random_state=0)
df_test = X_train,X_Test,y_train,y_test=train_test_split(df['Reviews'],df['Installs'], test_size=0.3, random_state=0)
df_train = np.array(df_train, dtype=object)
df_test = np.array(df_test, dtype=object)
df_train[0] = np.array([4])
df_test[0] = np.array([4])
df_train_1= df_train.reshape(4,1)
df_test_1= df_test.reshape(4,1)
#df_train_1
#df_test_1
model = LinearRegression().fit(df_train_1, df_test_1)
r_sq = model.score(df_train_1, df_test_1)
print(r_sq)
I keep making adjustments to my arrays, but I keep getting this error: "ValueError: setting an array element with a sequence." I can't figure out what to change to make it work.

Related

Converting strings to numpy arrays to run a for loop on dataframe linear regression

import datetime
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import os
os.chdir(r'path')
df = pd.read_excel("test.xlsx")
print(df)
def process_date(date_str):
    # the date format is month-day-year
    d = datetime.datetime.strptime(date_str, '%m-%d-%Y')
    return d.timestamp()
df['Date'] = df['Date'].apply(process_date)
df.head()
for (columnName, columnData) in df.iteritems():
    cn = columnName
    cd = columnData
    print('Column Name : ', columnName)
    print('Column Contents : ', columnData.values)
    X = np.array(cn).reshape(-1,1)
    y = cd.to_numpy()
    model = LinearRegression()
    model.fit(X, y)
ValueError: Unable to convert array of bytes/strings into decimal numbers with dtype='numeric'
Here is my dataset
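For reference, that ValueError comes from scikit-learn trying to cast a string array to numbers: X = np.array(cn).reshape(-1,1) wraps the column name (a string), not the column's values. A minimal sketch of the likely intent, assuming the goal is to regress each column against the processed Date column (the frame below is a made-up stand-in for test.xlsx):

import pandas as pd
from sklearn.linear_model import LinearRegression

# hypothetical data standing in for the real spreadsheet
df = pd.DataFrame({'Date': [1.0, 2.0, 3.0, 4.0],
                   'A': [2.0, 4.0, 6.0, 8.0],
                   'B': [1.0, 1.0, 2.0, 2.0]})

X = df[['Date']].to_numpy()          # numeric feature matrix, shape (n, 1)
for column in df.columns.drop('Date'):
    y = df[column].to_numpy()        # numeric target for this column
    model = LinearRegression().fit(X, y)
    print(column, model.coef_, model.intercept_)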

cannot concatenate object of type "<class 'numpy.ndarray'>"; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are valid

My input data is in the following form:
gold,Program,MethodType,CallersT,CallersN,CallersU,CallersCallersT,CallersCallersN,CallersCallersU,CalleesT,CalleesN,CalleesU,CalleesCalleesT,CalleesCalleesN,CalleesCalleesU,CompleteCallersCallees,classGold
T,chess,Inner,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,-1,Low,1,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,NoTrace,
T,chess,Inner,Low,-1,-1,Low,-1,-1,Medium,-1,Medium,High,-1,High,0,Trace,
T,chess,Inner,Low,-1,-1,Low,-1,-1,Medium,-1,Medium,High,-1,High,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,Low,Low,High,Medium,-1,Medium,0,Trace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,Low,Low,Medium,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,-1,Medium,Medium,0,NoTrace,
T,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,Low,Low,Medium,0,Trace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,-1,Medium,Medium,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,Low,Low,High,Low,Low,Medium,0,Trace,
N,chess,Inner,Low,-1,-1,-1,-1,-1,Low,Low,High,Low,Low,Medium,0,Trace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,-1,Medium,Medium,0,NoTrace,
....
N,chess,Inner,-1,Low,-1,-1,Medium,-1,-1,Low,Low,-1,-1,-1,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Medium,-1,-1,Low,Low,-1,-1,-1,0,NoTrace,
T,chess,Inner,Low,-1,-1,Low,Low,-1,Low,-1,Low,-1,-1,-1,0,Trace,
T,chess,Inner,Low,-1,-1,Medium,-1,-1,Low,-1,Low,-1,-1,-1,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Medium,-1,-1,Low,Low,-1,-1,-1,0,NoTrace,
I am reading my data and trying to concatenate two data sets that are subsets of the original data set. Here is the code I am using:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
# Feature Scaling
from sklearn.preprocessing import StandardScaler
SeparateProjectLearning=False
CompleteCallersCallees=False
PartialTrainingSetCompleteCallersCallees=True
def main():
    X_train={}
    X_test={}
    y_train={}
    y_test={}
    dataset = pd.read_csv('InputData.txt', sep=',', index_col=False)
    # convert T into 1 and N into 0
    dataset['gold'] = dataset['gold'].astype('category').cat.codes
    dataset['Program'] = dataset['Program'].astype('category').cat.codes
    dataset['classGold'] = dataset['classGold'].astype('category').cat.codes
    dataset['MethodType'] = dataset['MethodType'].astype('category').cat.codes
    dataset['CallersT'] = dataset['CallersT'].astype('category').cat.codes
    dataset['CallersN'] = dataset['CallersN'].astype('category').cat.codes
    dataset['CallersU'] = dataset['CallersU'].astype('category').cat.codes
    dataset['CallersCallersT'] = dataset['CallersCallersT'].astype('category').cat.codes
    dataset['CallersCallersN'] = dataset['CallersCallersN'].astype('category').cat.codes
    dataset['CallersCallersU'] = dataset['CallersCallersU'].astype('category').cat.codes
    dataset['CalleesT'] = dataset['CalleesT'].astype('category').cat.codes
    dataset['CalleesN'] = dataset['CalleesN'].astype('category').cat.codes
    dataset['CalleesU'] = dataset['CalleesU'].astype('category').cat.codes
    dataset['CalleesCalleesT'] = dataset['CalleesCalleesT'].astype('category').cat.codes
    dataset['CalleesCalleesN'] = dataset['CalleesCalleesN'].astype('category').cat.codes
    dataset['CalleesCalleesU'] = dataset['CalleesCalleesU'].astype('category').cat.codes
    pd.set_option('display.max_columns', None)
    row_count, column_count = dataset.shape
    Xcol = dataset.iloc[:, 1:column_count]
    CompleteSet = dataset.loc[dataset['CompleteCallersCallees'] == 1]
    CompleteSet_X = CompleteSet.iloc[:, 1:column_count].values
    CompleteSet_Y = CompleteSet.iloc[:, 0].values
    X_train, X_test, y_train, y_test = train_test_split(CompleteSet_X, CompleteSet_Y, test_size = 0.2, random_state = 0)
    TestSet = dataset.loc[dataset['CompleteCallersCallees'] == 0]
    X_test1 = TestSet.iloc[:, 1:column_count].values
    X_test = pd.concat(X_test1,X_test)
I want to build my own test set and training set by concatenation, but the last line, X_test=pd.concat(X_test1,X_test), raises TypeError: cannot concatenate object of type "<class 'numpy.ndarray'>"; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are valid. How can I fix this?
By adding .values to the end of your filters in the following lines:
CompleteSet_X = CompleteSet.iloc[:, 1:column_count].values
CompleteSet_Y = CompleteSet.iloc[:, 0].values
X_test1=TestSet.iloc[:, 1:column_count].values
You are extracting the underlying NumPy ndarray from the pandas Series/DataFrame that the prior code produces. Just remove .values at the end, and you can use concat directly with the Series or DataFrame.
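Note also that pd.concat expects the objects to be passed as a single list. A minimal sketch of the fixed ending, assuming the same CompleteSet/TestSet slices as above:

# keep the slices as DataFrames (no .values) so pandas can concatenate them
CompleteSet_X = CompleteSet.iloc[:, 1:column_count]
CompleteSet_Y = CompleteSet.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(CompleteSet_X, CompleteSet_Y, test_size=0.2, random_state=0)
X_test1 = TestSet.iloc[:, 1:column_count]
# pd.concat takes a list (or other iterable) of DataFrames/Series
X_test = pd.concat([X_test1, X_test])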

loop through dataframe columns to do simple linear regression?

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
df = pd.read_excel("Book1.xlsx")
for column in df:
X = df["Row Labels"]
Y = df[column]
y1 =Y.values.reshape(-1,1)
x1 =X.values.reshape(-1,1)
regressor = LinearRegression()
regressor.fit(x1, y1)
y_new = []
y_i = []
for i in range(12,24):
y_new.append(regressor.predict([[i]]))
y_i.append(i)
df2 = pd.DataFrame({'column':y_new})
I wrote this code to loop through the dataframe columns, run a simple linear regression for each one, and put all of the predicted values in a dataframe, but it only keeps the predictions for the last column.
df2 = pd.DataFrame({'column':y_new}) creates a column literally named 'column' (not the name stored in the variable column). Moreover, df2 is recreated in every iteration, so it only ever holds the last y_new.
I think what you want is to create a new column in df2 in each iteration:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
df = pd.read_excel("Book1.xlsx")
df2 = pd.DataFrame()
for column in df:
X = df["Row Labels"]
Y = df[column]
y1 =Y.values.reshape(-1,1)
x1 =X.values.reshape(-1,1)
regressor = LinearRegression()
regressor.fit(x1, y1)
y_new = []
y_i = []
for i in range(12,24):
y_new.append(regressor.predict([[i]]))
y_i.append(i)
df2[column] = y_new
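One detail worth noting (an addition, not part of the original answer): regressor.predict([[i]]) returns a 1x1 array here, so each cell of df2 ends up holding an array rather than a scalar. If plain numbers are preferred, the inner loop can extract the value before appending:

    for i in range(12,24):
        y_new.append(regressor.predict([[i]])[0][0])  # take the single predicted value
        y_i.append(i)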

Summarize Loop Results in Pandas Table

I have code that downloads tickers and runs a linear regression for each stock in the downloaded list. I am stuck on the last step: showing the Prediction & Residual values for each stock, for the last date in the data.
import pandas as pd
import numpy as np
import yfinance as yf
import datetime as dt
from sklearn import linear_model
tickers = ['EXPE','MSFT']
data = yf.download(tickers, start="2012-04-03", end="2017-07-07")['Close']
data = data.reset_index()
data = data.dropna()
df = pd.DataFrame(data, columns = ["Date"])
df["Date"]=df["Date"].apply(lambda x: x.toordinal())
for ticker in tickers:
    data[ticker] = pd.DataFrame(data, columns = [ticker])
    X = df
    y = data[ticker]
    lm = linear_model.LinearRegression()
    model = lm.fit(X,y)
    predictions = lm.predict(X)
    residuals = y - lm.predict(X)
    print(predictions[-1:])
    print(residuals[-1:])
The current output looks like this:
[136.28856636]
1323 13.491432
Name: EXPE, dtype: float64
[64.19943648]
1323 5.260563
Name: MSFT, dtype: float64
But I would like it to be shown like this (as a pandas table):
Predictions Residuals
EXPE 136.29 13.49
MSFT 64.20 5.26
You could do something like this where you store values in a list:
import pandas as pd
import numpy as np
import yfinance as yf
import datetime as dt
from sklearn import linear_model
tickers = ['EXPE','MSFT']
data = yf.download(tickers, start="2012-04-03", end="2017-07-07")['Close']
data = data.reset_index()
data = data.dropna()
df = pd.DataFrame(data, columns = ["Date"])
df["Date"]=df["Date"].apply(lambda x: x.toordinal())
predictions_output = []
residuals_output = []
for ticker in tickers:
    data[ticker] = pd.DataFrame(data, columns = [ticker])
    X = df
    y = data[ticker]
    lm = linear_model.LinearRegression()
    model = lm.fit(X,y)
    predictions = lm.predict(X)
    residuals = y - lm.predict(X)
    predictions_output.append(float(predictions[-1:]))
    residuals_output.append(float(residuals[-1:]))
expectation_df = pd.DataFrame(list(zip(predictions_output, residuals_output)),
                              columns=['Predictions', 'Residuals']).set_index([tickers])
print(expectation_df)
with the output being:
Predictions Residuals
EXPE 136.288566 13.491432
MSFT 64.199436 5.260563
EDIT: I went too quickly; looking back I realized tickers was already defined, so you can use it to set your index here and drop the Tickers index heading to match your desired output.
Also, if you want those values rounded, just use these two lines in your loop instead:
predictions_output.append(round(float(predictions[-1:]), 2))
residuals_output.append(round(float(residuals[-1:]), 2))
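An equivalent option (not from the original answer) is to leave the loop alone and round the finished frame with pandas:

expectation_df = expectation_df.round(2)  # round both columns to 2 decimals
print(expectation_df)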

Convert DF into Numpy Array for calculations

I have the data in a dataframe format that I will use for a linear regression calculation with a user-built function. Here is the code:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
boston = load_boston()
bos = pd.DataFrame(boston.data) # convert to DF
bos.columns = boston.feature_names
bos['PRICE'] = boston.target
y = bos.PRICE
x = bos.drop('PRICE', axis = 1) # DROP PRICE since only want X-type variables (not Y-target)
xw = df.to_array(x)
xw = np.insert(xw,0,1, axis = 1) # to insert a column of "1" values
However, I am getting the error:
AttributeError Traceback (most recent call last)
<ipython-input-131-272f1b4d26ba> in <module>()
1 import copy
2
----> 3 xw = df.to_array(x)
AttributeError: 'int' object has no attribute 'to_array'
I am not sure where the problem is. I need to pass an array of values (x in this case) to the function to execute some matrix operations.
The insert function was working during step-by-step code development but for some reason is failing here.
I tried:
xw = copy.deepcopy(x)
with no success
Any thoughts?
It is x.as_matrix(), not df.to_array(x).
Please refer to the pandas documentation for more detail on as_matrix().
Here is the code that works:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
boston = load_boston()
bos = pd.DataFrame(boston.data) # convert to DF
bos.columns = boston.feature_names
bos['PRICE'] = boston.target
y = bos.PRICE
x = bos.drop('PRICE', axis = 1) # DROP PRICE since only want X-type variables (not Y-target)
xw = x.as_matrix()
xw = np.insert(xw,0,1, axis = 1) # to insert a column of "1" values
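One caveat worth adding: as_matrix() was deprecated and removed in newer pandas releases, so on a current install the equivalent lines would use .to_numpy() (or .values) instead:

xw = x.to_numpy()                 # modern replacement for x.as_matrix()
xw = np.insert(xw, 0, 1, axis=1)  # prepend the column of 1s as before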
