Issues with "setting an array element with a sequence" - python
I'm having an issue with a "setting an array element with a sequence" error.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
df = pd.read_csv('googleplaystore.csv') # 1
df = df.dropna() # 3
df['Size'] = df['Size'].str.extract(r'(\d+\.?\d)', expand=False).astype(float) * df['Size'].str[-1].replace({'M': 1024, 'k': 1}) # 4
df = df.dropna() # remove nan from "Varies with device"
df['Price'] = df['Price'].str.strip('$').astype(float) # 5
df['Installs'] = df['Installs'].str.strip('+')
df['Installs'] = df['Installs'].str.replace(',',"").astype(int)
df['Reviews'] = df['Reviews'].astype(float)
df['Size'] = df['Size'].astype(float)
df = df.loc[df['Rating'].between(1, 5)] # 6
df = df.loc[df['Type'] != 'Free'] # 7
df.drop(df[df['Price'] >= 200].index, inplace = True)
df.drop(df[df['Reviews'] >2000000].index, inplace = True)
df.drop(df[df['Installs'] >10000].index, inplace = True)
inp1 = df.copy()
df_reviewslog=np.log10(df['Reviews'])
df_installslog=np.log10(df['Installs'])
del df['App']
del df['Last Updated']
del df['Current Ver']
del df['Android Ver']
pd.get_dummies(df, columns=['Category', 'Genres', 'Content Rating'], drop_first=True)
inp2 = df.copy()
df_train = X_train,X_test,y_train,y_test=train_test_split(df['Reviews'],df['Installs'], test_size=0.7, random_state=0)
df_test = X_train,X_Test,y_train,y_test=train_test_split(df['Reviews'],df['Installs'], test_size=0.3, random_state=0)
df_train = np.array(df_train, dtype=object)
df_test = np.array(df_test, dtype=object)
df_train[0] = np.array([4])
df_test[0] = np.array([4])
df_train_1= df_train.reshape(4,1)
df_test_1= df_test.reshape(4,1)
#df_train_1
#df_test_1
model = LinearRegression().fit(df_train_1, df_test_1)
r_sq = model.score(df_train_1, df_test_1)
print(r_sq)
I keep making adjustments to my arrays to get them to work, but I keep getting the same error:
"ValueError: setting an array element with a sequence." I can't figure out what to change to make it work.
Related
Converting strings to numpy arrays to run a for loop on dataframe linear regression
import datetime
import numpy as np
from sklearn.linear_model import LinearRegression
import os

os.chdir(r'path')
df = pd.read_excel("test.xlsx")
print(df)

def process_date(date_str):
    # the date format is month-day-year
    d = datetime.datetime.strptime(date_str, '%m-%d-%Y')
    return d.timestamp()

df['Date'] = df['Date'].apply(process_date)
df.head()

for (columnName, columnData) in df.iteritems():
    cn = columnName
    cd = columnData
    print('Column Name : ', columnName)
    print('Column Contents : ', columnData.values)
    X = np.array(cn).reshape(-1,1)
    y = cd.to_numpy()
    model = LinearRegression()
    model.fit(X, y)

ValueError: Unable to convert array of bytes/strings into decimal numbers with dtype='numeric'

Here is my dataset
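No answer is shown for this question, but the likely culprit is X = np.array(cn).reshape(-1,1), which hands the column name (a string) to the model instead of numeric data. A hedged sketch of one way to set this up, assuming the other columns are numeric and the intent is to regress each of them on the Date column:

X = df['Date'].to_numpy().reshape(-1, 1)    # timestamps as the single feature
for columnName, columnData in df.items():   # items() replaces the deprecated iteritems()
    if columnName == 'Date':
        continue
    y = columnData.to_numpy()
    model = LinearRegression().fit(X, y)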
cannot concatenate object of type "<class 'numpy.ndarray'>"; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are valid
My input data is in the following form:

gold,Program,MethodType,CallersT,CallersN,CallersU,CallersCallersT,CallersCallersN,CallersCallersU,CalleesT,CalleesN,CalleesU,CalleesCalleesT,CalleesCalleesN,CalleesCalleesU,CompleteCallersCallees,classGold
T,chess,Inner,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,-1,Low,1,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,High,-1,-1,-1,Low,1,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,NoTrace,
T,chess,Inner,Low,-1,-1,Low,-1,-1,Medium,-1,Medium,High,-1,High,0,Trace,
T,chess,Inner,Low,-1,-1,Low,-1,-1,Medium,-1,Medium,High,-1,High,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Low,-1,-1,Medium,Medium,-1,High,High,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,Low,Low,High,Medium,-1,Medium,0,Trace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,Low,Low,Medium,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,-1,Medium,Medium,0,NoTrace,
T,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,Low,Low,Medium,0,Trace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,-1,Medium,Medium,0,NoTrace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,Low,Low,High,Low,Low,Medium,0,Trace,
N,chess,Inner,Low,-1,-1,-1,-1,-1,Low,Low,High,Low,Low,Medium,0,Trace,
N,chess,Inner,-1,Low,-1,-1,-1,-1,-1,Medium,High,-1,Medium,Medium,0,NoTrace,
....
N,chess,Inner,-1,Low,-1,-1,Medium,-1,-1,Low,Low,-1,-1,-1,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Medium,-1,-1,Low,Low,-1,-1,-1,0,NoTrace,
T,chess,Inner,Low,-1,-1,Low,Low,-1,Low,-1,Low,-1,-1,-1,0,Trace,
T,chess,Inner,Low,-1,-1,Medium,-1,-1,Low,-1,Low,-1,-1,-1,0,Trace,
N,chess,Inner,-1,Low,-1,-1,Medium,-1,-1,Low,Low,-1,-1,-1,0,NoTrace,

I am reading my data and trying to concatenate two data sets that are subsets of the original data set. Here is the code I am using:

import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
# Feature Scaling
from sklearn.preprocessing import StandardScaler

SeparateProjectLearning=False
CompleteCallersCallees=False
PartialTrainingSetCompleteCallersCallees=True

def main():
    X_train={}
    X_test={}
    y_train={}
    y_test={}
    dataset = pd.read_csv( 'InputData.txt', sep= ',', index_col=False)
    #convert T into 1 and N into 0
    dataset['gold'] = dataset['gold'].astype('category').cat.codes
    dataset['Program'] = dataset['Program'].astype('category').cat.codes
    dataset['classGold'] = dataset['classGold'].astype('category').cat.codes
    dataset['MethodType'] = dataset['MethodType'].astype('category').cat.codes
    dataset['CallersT'] = dataset['CallersT'].astype('category').cat.codes
    dataset['CallersN'] = dataset['CallersN'].astype('category').cat.codes
    dataset['CallersU'] = dataset['CallersU'].astype('category').cat.codes
    dataset['CallersCallersT'] = dataset['CallersCallersT'].astype('category').cat.codes
    dataset['CallersCallersN'] = dataset['CallersCallersN'].astype('category').cat.codes
    dataset['CallersCallersU'] = dataset['CallersCallersU'].astype('category').cat.codes
    dataset['CalleesT'] = dataset['CalleesT'].astype('category').cat.codes
    dataset['CalleesN'] = dataset['CalleesN'].astype('category').cat.codes
    dataset['CalleesU'] = dataset['CalleesU'].astype('category').cat.codes
    dataset['CalleesCalleesT'] = dataset['CalleesCalleesT'].astype('category').cat.codes
    dataset['CalleesCalleesN'] = dataset['CalleesCalleesN'].astype('category').cat.codes
    dataset['CalleesCalleesU'] = dataset['CalleesCalleesU'].astype('category').cat.codes
    pd.set_option('display.max_columns', None)
    row_count, column_count = dataset.shape
    Xcol = dataset.iloc[:, 1:column_count]
    CompleteSet=dataset.loc[dataset['CompleteCallersCallees'] == 1]
    CompleteSet_X = CompleteSet.iloc[:, 1:column_count].values
    CompleteSet_Y = CompleteSet.iloc[:, 0].values
    X_train, X_test, y_train, y_test = train_test_split(CompleteSet_X, CompleteSet_Y, test_size = 0.2, random_state = 0)
    TestSet=dataset.loc[dataset['CompleteCallersCallees'] == 0]
    X_test1=TestSet.iloc[:, 1:column_count].values
    X_test=pd.concat(X_test1,X_test)

I want to build my own test set and training set by concatenation, so I am trying to concatenate X_test1 and X_test in the code above. However, the last line, X_test=pd.concat(X_test1,X_test), fails with the error TypeError: cannot concatenate object of type "<class 'numpy.ndarray'>"; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are valid. How can I fix this?
By adding .values to the end of your filters in the following lines:

CompleteSet_X = CompleteSet.iloc[:, 1:column_count].values
CompleteSet_Y = CompleteSet.iloc[:, 0].values
X_test1=TestSet.iloc[:, 1:column_count].values

you are extracting the underlying NumPy ndarray from the pandas Series/DataFrame that the preceding code produces. Just remove .values at the end and you can use concat directly with the Series or DataFrame.
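For completeness, a minimal sketch of how the tail of main() might look with that fix applied (my illustration, not code from the answer); note that pd.concat also expects a single list of objects rather than two positional arguments:

CompleteSet_X = CompleteSet.iloc[:, 1:column_count]    # keep as DataFrame
CompleteSet_Y = CompleteSet.iloc[:, 0]                 # keep as Series
X_train, X_test, y_train, y_test = train_test_split(CompleteSet_X, CompleteSet_Y, test_size=0.2, random_state=0)
TestSet = dataset.loc[dataset['CompleteCallersCallees'] == 0]
X_test1 = TestSet.iloc[:, 1:column_count]
X_test = pd.concat([X_test1, X_test])                  # pass the objects as a list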
Loop through dataframe columns to do simple linear regression?
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

df = pd.read_excel("Book1.xlsx")

for column in df:
    X = df["Row Labels"]
    Y = df[column]
    y1 = Y.values.reshape(-1,1)
    x1 = X.values.reshape(-1,1)
    regressor = LinearRegression()
    regressor.fit(x1, y1)
    y_new = []
    y_i = []
    for i in range(12,24):
        y_new.append(regressor.predict([[i]]))
        y_i.append(i)
    df2 = pd.DataFrame({'column':y_new})

I wrote this code to loop through the dataframe columns, run a simple linear regression for each one, and put all the predicted values in a dataframe, but it only keeps the predictions for the last column.
df2 = pd.DataFrame({'column':y_new}) creates a column literally named 'column' (not the name stored in the variable column). Moreover, df2 is recreated in every iteration, so it only keeps the last y_new. I think what you want is to add a new column to df2 in each iteration:

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

df = pd.read_excel("Book1.xlsx")
df2 = pd.DataFrame()

for column in df:
    X = df["Row Labels"]
    Y = df[column]
    y1 = Y.values.reshape(-1,1)
    x1 = X.values.reshape(-1,1)
    regressor = LinearRegression()
    regressor.fit(x1, y1)
    y_new = []
    y_i = []
    for i in range(12,24):
        y_new.append(regressor.predict([[i]]))
        y_i.append(i)
    df2[column] = y_new
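One further detail, as an editorial aside rather than part of the original answer: regressor.predict([[i]]) returns a (1, 1) array because y1 was reshaped into a column vector, so each cell of df2 ends up holding a small array. If plain numbers are preferred, the prediction can be reduced to a scalar inside the inner loop:

y_new.append(regressor.predict([[i]])[0, 0])   # store a scalar instead of a (1, 1) array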
Summarize Loop Results in Pandas Table
I have code that downloads tickers and runs a linear regression for each stock in the downloaded list. I am stuck on the last step: showing the Prediction & Residual values for each stock, for the last date in the data.

import pandas as pd
import numpy as np
import yfinance as yf
import datetime as dt
from sklearn import linear_model

tickers = ['EXPE','MSFT']
data = yf.download(tickers, start="2012-04-03", end="2017-07-07")['Close']
data = data.reset_index()
data = data.dropna()

df = pd.DataFrame(data, columns = ["Date"])
df["Date"] = df["Date"].apply(lambda x: x.toordinal())

for ticker in tickers:
    data[ticker] = pd.DataFrame(data, columns = [ticker])
    X = df
    y = data[ticker]
    lm = linear_model.LinearRegression()
    model = lm.fit(X,y)
    predictions = lm.predict(X)
    residuals = y - lm.predict(X)
    print(predictions[-1:])
    print(residuals[-1:])

The current output looks like this:

[136.28856636]
1323    13.491432
Name: EXPE, dtype: float64
[64.19943648]
1323    5.260563
Name: MSFT, dtype: float64

But I would like it to look like this (as a pandas table):

      Predictions  Residuals
EXPE       136.29      13.49
MSFT        64.20       5.26
You could do something like this, where you store the values in lists:

import pandas as pd
import numpy as np
import yfinance as yf
import datetime as dt
from sklearn import linear_model

tickers = ['EXPE','MSFT']
data = yf.download(tickers, start="2012-04-03", end="2017-07-07")['Close']
data = data.reset_index()
data = data.dropna()

df = pd.DataFrame(data, columns = ["Date"])
df["Date"] = df["Date"].apply(lambda x: x.toordinal())

predictions_output = []
residuals_output = []

for ticker in tickers:
    data[ticker] = pd.DataFrame(data, columns = [ticker])
    X = df
    y = data[ticker]
    lm = linear_model.LinearRegression()
    model = lm.fit(X,y)
    predictions = lm.predict(X)
    residuals = y - lm.predict(X)
    predictions_output.append(float(predictions[-1:]))
    residuals_output.append(float(residuals[-1:]))

expectation_df = pd.DataFrame(list(zip(predictions_output, residuals_output)),
                              columns=['Predictions', 'Residuals']).set_index([tickers])
print(expectation_df)

with the output being:

      Predictions  Residuals
EXPE   136.288566  13.491432
MSFT    64.199436   5.260563

EDIT: I went too quickly the first time; tickers is already defined, so you can use it to set the index here and lose a separate Tickers index heading to match your desired output. Also, if you want those values rounded, just use these two lines in your loop instead:

predictions_output.append(round(float(predictions[-1:]), 2))
residuals_output.append(round(float(residuals[-1:]), 2))
Convert DF into Numpy Array for calculations
I have data in a dataframe format that I will use for a linear regression calculation with a user-built function. Here is the code:

from sklearn.datasets import load_boston
boston = load_boston()
bos = pd.DataFrame(boston.data)   # convert to DF
bos.columns = boston.feature_names
bos['PRICE'] = boston.target

y = bos.PRICE
x = bos.drop('PRICE', axis = 1)   # DROP PRICE since only want X-type variables (not Y-target)

xw = df.to_array(x)
xw = np.insert(xw,0,1, axis = 1)  # to insert a column of "1" values

However, I am getting the error:

AttributeError                            Traceback (most recent call last)
<ipython-input-131-272f1b4d26ba> in <module>()
      1 import copy
      2
----> 3 xw = df.to_array(x)

AttributeError: 'int' object has no attribute 'to_array'

I am not sure where the problem is. I need to pass an array of values (x in this case) to the function so it can execute some matrix operations. The insert function was working during step-by-step code development but for some reason is failing here. I tried xw = copy.deepcopy(x) with no success. Any thoughts?
It is x.as_matrix(), not df.to_array(x). Please refer to the pandas documentation for more detail on as_matrix(). Here is code that works:

from sklearn.datasets import load_boston
import pandas as pd
import numpy as np

boston = load_boston()
bos = pd.DataFrame(boston.data)   # convert to DF
bos.columns = boston.feature_names
bos['PRICE'] = boston.target

y = bos.PRICE
x = bos.drop('PRICE', axis = 1)   # DROP PRICE since only want X-type variables (not Y-target)

xw = x.as_matrix()
xw = np.insert(xw,0,1, axis = 1)  # to insert a column of "1" values
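A side note, not part of the original answer: DataFrame.as_matrix() was deprecated and later removed in newer pandas releases, where the equivalent call is .to_numpy() (or the .values attribute):

xw = x.to_numpy()                 # modern pandas equivalent of x.as_matrix()
xw = np.insert(xw, 0, 1, axis=1)  # prepend a column of ones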