I can't get the scraped data into Excel using this code:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd

driver = webdriver.Chrome()
mainurl = 'https://offerup.com/explore/sck/tx/austin/cars-trucks/'
driver.get(mainurl)
res = driver.execute_script("return document.documentElement.outerHTML")
page_soup = BeautifulSoup(res, 'html.parser')
# btn = driver.find_element_by_xpath('//*[@id="react-container"]/div/div[2]/div[2]/div[2]/div[3]/button').click()

records = []
for a in page_soup.find_all('span', class_='_nn5xny4 _y9ev9r'):
    title = a.text
    print(title)
    records.append(title)

prices = []
for b in page_soup.find_all('span', class_='_s3g03e4'):
    price = b.text
    print(price)
    prices.append(price)

location = []
for c in page_soup.find_all('span', class_='_19rx43s2'):
    loc = c.text
    print(loc)
    location.append(loc)

df = pd.DataFrame(records, prices, location)
print(df)
df.to_csv('trymenew.csv')
Traceback (most recent call last):
File "C:\Users\Saba\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\internals\managers.py", line 1654, in create_block_manager_from_blocks
make_block(values=blocks[0], placement=slice(0, len(axes[0])))
File "C:\Users\Saba\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\internals\blocks.py", line 3047, in make_block
return klass(values, ndim=ndim, placement=placement)
File "C:\Users\Saba\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\internals\blocks.py", line 2595, in __init__
super().__init__(values, ndim=ndim, placement=placement)
File "C:\Users\Saba\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\internals\blocks.py", line 124, in __init__
raise ValueError(
ValueError: Wrong number of items passed 1, placement implies 44
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Saba\Desktop\cars_offerup.py", line 29, in <module>
df = pd.DataFrame(records, prices, location)
File "C:\Users\Saba\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\frame.py", line 488, in __init__
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
File "C:\Users\Saba\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\internals\construction.py", line 210, in init_ndarray
return create_block_manager_from_blocks(block_values, [columns, index])
File "C:\Users\Saba\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\internals\managers.py", line 1664, in create_block_manager_from_blocks
construction_error(tot_items, blocks[0].shape[1:], axes, e)
File "C:\Users\Saba\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\internals\managers.py", line 1694, in construction_error
raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
ValueError: Shape of passed values is (44, 1), indices imply (44, 44)
I get this error. Can you help me solve it?
As we can see in the documentation, pd.DataFrame expects the data as its first argument, but you passed the data as three separate arguments, so prices and location were interpreted as the index and columns.
To fix that, we change:
df = pd.DataFrame(records, prices, location) # Wrong
to
df = pd.DataFrame([records, prices, location]) # Correct
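Note that this gives you one row per list. If you want the titles, prices, and locations as columns instead, a dict constructor is a clearer fix; a minimal sketch with made-up values (the column names are my own choice, not from the question):

import pandas as pd

records = ['2015 Ford F-150', '2012 Honda Civic']
prices = ['$18,000', '$7,500']
location = ['Austin, TX', 'Round Rock, TX']

# Each key becomes a column header; the three lists must be the same length.
df = pd.DataFrame({'title': records, 'price': prices, 'location': location})
df.to_csv('trymenew.csv', index=False)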
Related
I want to do the following:
I have data in long format organized by dates
Sometimes data is missing, as there is no record of it
I found a solution that fills in missing data using reindex. It works fine when used outside of a function, but for some reason it doesn't work when used inside one:
import numpy as np
import pandas as pd
from datetime import datetime

def sum_customer_portfolio(country, sold_to):
    df = pd.merge(etl_customer_portfolio(), etl_week(), how="left", on=["Country", "GCAS"])
    df = df.loc[df["Country"].isin(country)]
    df = df.loc[df["Sold_to"].isin(sold_to)]

    df_week = etl_week()
    df_week = df_week.dropna(subset=["Sold_to"])
    df_week = df_week[["Week_num", "Date_range"]]
    df_week = df_week.drop_duplicates(subset=["Date_range"])

    sum_df = pd.merge(df, df_week, how="outer", on=["Week_num", "Date_range"])
    sum_df["Stat_unit_qty"] = sum_df["Stat_unit_qty"].fillna(0, axis=0)
    sum_df[["Country", "Sold_to", "Customer"]] = sum_df[["Country", "Sold_to", "Customer"]].fillna(method="ffill", axis=0)
    sum_df = sum_df.fillna("DUMMY_NOT_USE").replace("DUMMY_NOT_USE", np.nan)

    reindex_subset = sum_df[["GCAS", "Week_num", "Stat_unit_qty"]]
    reindex_subset = reindex_subset.dropna()
    reindex_subset = reindex_subset.set_index("Week_num")
    reindex_subset = (reindex_subset.groupby("GCAS")
                      .apply(lambda x: x.reindex(list(range(reindex_subset.index.min(),
                                                            reindex_subset.index.max() + 1)),
                                                 fill_value=0))
                      .drop("GCAS", axis=1)
                      .reset_index("GCAS")
                      .fillna(0)
                      .reset_index())
    reindex_subset = reindex_subset.drop(columns=["Stat_unit_qty"])

    final_df = pd.merge(sum_df, reindex_subset, how="outer", on=["GCAS", "Week_num"])
    current_date = datetime.now().strftime("%d%m%Y_%H%M%S")
    # return sum_df.to_excel(f"CUSTOMER_PORTFOLIO-{current_date}.xlsx", sheet_name="GCAS_SUM", index=False)
    return final_df
The code above keeps giving me the following error:
Traceback (most recent call last):
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 3361, in get_loc
return self._engine.get_loc(casted_key)
File "pandas\_libs\index.pyx", line 76, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 103, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 135, in pandas._libs.index.IndexEngine._get_loc_duplicates
File "pandas\_libs\index_class_helper.pxi", line 51, in pandas._libs.index.Float64Engine._maybe_get_bool_indexer
File "pandas\_libs\index.pyx", line 161, in pandas._libs.index.IndexEngine._unpack_bool_indexer
KeyError: 0
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\main.py", line 167, in <module>
sum_customer_portfolio(country=["Croatia", "Slovenia"], sold_to=[2000829798, 2000558171]).to_excel(writer, index=False, sheet_name="GCAS_SUM")
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\main.py", line 113, in sum_customer_portfolio
reindex_subset = (reindex_subset.groupby(["GCAS", "Sold_to"]).apply(
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\groupby.py", line 1253, in apply
result = self._python_apply_general(f, self._selected_obj)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\groupby.py", line 1287, in _python_apply_general
keys, values, mutated = self.grouper.apply(f, data, self.axis)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\ops.py", line 783, in apply
result_values, mutated = splitter.fast_apply(f, sdata, group_keys)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\ops.py", line 1328, in fast_apply
return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)
File "pandas\_libs\reduction.pyx", line 369, in pandas._libs.reduction.apply_frame_axis0
File "pandas\_libs\reduction.pyx", line 428, in pandas._libs.reduction.BlockSlider.__init__
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\frame.py", line 3430, in __getitem__
indexer = convert_to_index_sliceable(self, key)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexing.py", line 2329, in convert_to_index_sliceable
return idx._convert_slice_indexer(key, kind="getitem")
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\numeric.py", line 242, in _convert_slice_indexer
return self.slice_indexer(key.start, key.stop, key.step, kind=kind)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5686, in slice_indexer
start_slice, end_slice = self.slice_locs(start, end, step=step)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5894, in slice_locs
end_slice = self.get_slice_bound(end, "right")
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5808, in get_slice_bound
raise err
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5802, in get_slice_bound
slc = self.get_loc(label)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 3363, in get_loc
raise KeyError(key) from err
KeyError: 0
When loading the data directly from Excel (the same data produced by the function), for example "CUSTOMER_PORTFOLIO-11082021_234057.xlsx", and running the following code:
sum_df = pd.read_excel("CUSTOMER_PORTFOLIO-11082021_234057.xlsx")
reindex_subset = sum_df[["GCAS", "Week_num", "Stat_unit_qty"]]
reindex_subset = reindex_subset.dropna()
reindex_subset = reindex_subset.set_index("Week_num")
reindex_subset = (reindex_subset.groupby("GCAS")
                  .apply(lambda x: x.reindex(list(range(reindex_subset.index.min(),
                                                        reindex_subset.index.max() + 1)),
                                             fill_value=0))
                  .drop("GCAS", axis=1)
                  .reset_index("GCAS")
                  .fillna(0)
                  .reset_index())
reindex_subset = reindex_subset.drop(columns=["Stat_unit_qty"])
final_df = pd.merge(sum_df, reindex_subset, how="outer", on=["GCAS", "Week_num"])
this code gives me the results that I want.
What am I missing? I tried searching for this on Stack Overflow, but with no success as of yet. I have tried resetting the index, but unfortunately it didn't help.
UPDATE: Pasted the full error traceback. As I said above, when I run the function without the part of the code that "reindexes" the data, it works just fine. I have also tried the following, still with no luck:
df_new = df.copy(deep=True)
df_week= df_week.copy(deep=True)
And when I run the "reindex" part of the code on a finished .xlsx, it works just fine, which is strange in itself.
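For anyone trying to reproduce this, here is the reindex-per-group pattern in isolation on a small toy frame (my own dummy data, not the real portfolio data); this is the behaviour the function is supposed to produce for each GCAS:

import pandas as pd

# Toy data: GCAS 100 is missing week 2, GCAS 200 is missing week 1.
df = pd.DataFrame({"GCAS": [100, 100, 200, 200],
                   "Week_num": [1, 3, 2, 3],
                   "Stat_unit_qty": [5, 7, 4, 9]})

full_weeks = range(df["Week_num"].min(), df["Week_num"].max() + 1)
out = (df.set_index("Week_num")
         .groupby("GCAS")
         .apply(lambda x: x.reindex(full_weeks, fill_value=0))
         .drop("GCAS", axis=1)
         .reset_index())
print(out)  # one row per (GCAS, Week_num) pair, with 0 for the missing weeks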
I want to insert data into a data frame like:
df = pd.DataFrame(columns=["Date", "Title", "Artist"])
The insertion happens here:
df.insert(loc=0, column="Date", value=dateTime.group(0), allow_duplicates=True)
df.insert(loc=0, column="Title", value=title, allow_duplicates=True)
df.insert(loc=0, column="Artist", value=artist, allow_duplicates=True)
Sadly, I don't know how to handle these errors:
Traceback (most recent call last):
File "/Users/sashakaun/IdeaProjects/python/venv/lib/python3.8/site-packages/pandas/core/internals/construction.py", line 697, in _try_cast
subarr = maybe_cast_to_datetime(arr, dtype)
File "/Users/sashakaun/IdeaProjects/python/venv/lib/python3.8/site-packages/pandas/core/dtypes/cast.py", line 1067, in maybe_cast_to_datetime
value = maybe_infer_to_datetimelike(value)
File "/Users/sashakaun/IdeaProjects/python/venv/lib/python3.8/site-packages/pandas/core/dtypes/cast.py", line 865, in maybe_infer_to_datetimelike
if isinstance(value, (ABCDatetimeIndex, ABCPeriodIndex,
File "/Users/sashakaun/IdeaProjects/python/venv/lib/python3.8/site-packages/pandas/core/dtypes/generic.py", line 9, in _check
return getattr(inst, attr, '_typ') in comp
TypeError: 'in <string>' requires string as left operand, not NoneType
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/sashakaun/IdeaProjects/scrapyscrape/test.py", line 24, in <module>
df.insert(loc=0, column="Title", value=title, allow_duplicates=True)
File "/Users/sashakaun/IdeaProjects/python/venv/lib/python3.8/site-packages/pandas/core/frame.py", line 3470, in insert
self._ensure_valid_index(value)
File "/Users/sashakaun/IdeaProjects/python/venv/lib/python3.8/site-packages/pandas/core/frame.py", line 3424, in _ensure_valid_index
value = Series(value)
File "/Users/sashakaun/IdeaProjects/python/venv/lib/python3.8/site-packages/pandas/core/series.py", line 261, in __init__
data = sanitize_array(data, index, dtype, copy,
File "/Users/sashakaun/IdeaProjects/python/venv/lib/python3.8/site-packages/pandas/core/internals/construction.py", line 625, in sanitize_array
subarr = _try_cast(data, False, dtype, copy, raise_cast_failure)
File "/Users/sashakaun/IdeaProjects/python/venv/lib/python3.8/site-packages/pandas/core/internals/construction.py", line 720, in _try_cast
subarr = np.array(arr, dtype=object, copy=copy)
File "/Users/sashakaun/IdeaProjects/python/venv/lib/python3.8/site-packages/bs4/element.py", line 971, in __getitem__
return self.attrs[key]
KeyError: 0
It's my first question, so please be kind.
Thanks in advance.
The error seems to come from your value=dateTime.group(0) argument. Can you elaborate on the structure of dateTime?
Also, df.insert() inserts a column rather than adding a row of data to a dataframe.
You should first wrap your row in a DataFrame (or Series) and then use pd.concat() to concatenate it with the original dataframe, as in the sketch after the references below.
Below are some references:
https://kite.com/python/answers/how-to-insert-a-row-into-a-pandas-dataframe
Add one row to pandas DataFrame
https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
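A minimal sketch of that row-wise approach (the date/title/artist values here are placeholders for whatever your scraper extracts):

import pandas as pd

df = pd.DataFrame(columns=["Date", "Title", "Artist"])

# Build each record as a one-row DataFrame, then concatenate it onto the main frame.
row = pd.DataFrame([{"Date": "2020-01-01", "Title": "Some Title", "Artist": "Some Artist"}])
df = pd.concat([df, row], ignore_index=True)
print(df)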
When I try to convert some XML to a dataframe using xmltodict, a particular column contains all the info I need as a dict or a list of dicts. I'm able to split this column into multiple ones with pandas, but I'm not able to perform the same operation in dask.
It's not possible to use meta, because I have no idea of all the possible fields available in the XML, and dask is necessary because the real XML files are bigger than 1 GB each.
example.xml:
<?xml version="1.0" encoding="UTF-8"?>
<itemList>
<eventItem uid="1">
<timestamp>2019-07-04T09:57:35.044Z</timestamp>
<eventType>generic</eventType>
<details>
<detail>
<name>columnA</name>
<value>AAA</value>
</detail>
<detail>
<name>columnB</name>
<value>BBB</value>
</detail>
</details>
</eventItem>
<eventItem uid="2">
<timestamp>2019-07-04T09:57:52.188Z</timestamp>
<eventType>generic</eventType>
<details>
<detail>
<name>columnC</name>
<value>CCC</value>
</detail>
</details>
</eventItem>
</itemList>
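For reference, xmltodict parses this into nested dicts and lists roughly as follows (attributes get an @ prefix), which is where the details column comes from:

import xmltodict

with open("example.xml", "r", encoding="utf8") as f:
    parsed = xmltodict.parse(f.read())

# parsed["itemList"]["eventItem"] is a list with one dict per eventItem, e.g.:
# {'@uid': '1',
#  'timestamp': '2019-07-04T09:57:35.044Z',
#  'eventType': 'generic',
#  'details': {'detail': [{'name': 'columnA', 'value': 'AAA'},
#                         {'name': 'columnB', 'value': 'BBB'}]}}
print(parsed["itemList"]["eventItem"][0])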
Working pandas code:
import xmltodict
import collections
import pandas as pd

def pd_output_dict(details):
    detail = details.get("detail", [])
    ret_value = {}
    if type(detail) in (collections.OrderedDict, dict):
        ret_value[detail["name"]] = detail["value"]
    elif type(detail) == list:
        for i in detail:
            ret_value[i["name"]] = i["value"]
    return pd.Series(ret_value)

with open("example.xml", "r", encoding="utf8") as f:
    df_dict_list = xmltodict.parse(f.read()).get("itemList", {}).get("eventItem", [])

df = pd.DataFrame(df_dict_list)
df = pd.concat([df, df.apply(lambda row: pd_output_dict(row.details), axis=1, result_type="expand")], axis=1)
print(df.head())
Non-working dask code:
import xmltodict
import collections
import dask
import dask.bag as db
import dask.dataframe as dd

def dd_output_dict(row):
    detail = row.get("details", {}).get("detail", [])
    ret_value = {}
    if type(detail) in (collections.OrderedDict, dict):
        row[detail["name"]] = detail["value"]
    elif type(detail) == list:
        for i in detail:
            row[i["name"]] = i["value"]
    return row

with open("example.xml", "r", encoding="utf8") as f:
    df_dict_list = xmltodict.parse(f.read()).get("itemList", {}).get("eventItem", [])

df_bag = db.from_sequence(df_dict_list)
df = df_bag.to_dataframe()
df = df.apply(lambda row: dd_output_dict(row), axis=1)
The idea is to get a result in dask similar to the one I have in pandas, but at the moment I'm receiving errors:
>>> df = df.apply(lambda row: output_dict(row), axis=1)
Traceback (most recent call last):
File "C:\Anaconda3\lib\site-packages\dask\dataframe\utils.py", line 169, in raise_on_meta_error
yield
File "C:\Anaconda3\lib\site-packages\dask\dataframe\core.py", line 4711, in _emulate
return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
File "C:\Anaconda3\lib\site-packages\dask\utils.py", line 854, in __call__
return getattr(obj, self.method)(*args, **kwargs)
File "C:\Anaconda3\lib\site-packages\pandas\core\frame.py", line 6487, in apply
return op.get_result()
File "C:\Anaconda3\lib\site-packages\pandas\core\apply.py", line 151, in get_result
return self.apply_standard()
File "C:\Anaconda3\lib\site-packages\pandas\core\apply.py", line 257, in apply_standard
self.apply_series_generator()
File "C:\Anaconda3\lib\site-packages\pandas\core\apply.py", line 286, in apply_series_generator
results[i] = self.f(v)
File "<stdin>", line 1, in <lambda>
File "<stdin>", line 4, in output_dict
AttributeError: ("'str' object has no attribute 'get'", 'occurred at index 0')
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Anaconda3\lib\site-packages\dask\dataframe\core.py", line 3964, in apply
M.apply, self._meta_nonempty, func, args=args, udf=True, **kwds
File "C:\Anaconda3\lib\site-packages\dask\dataframe\core.py", line 4711, in _emulate
return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
File "C:\Anaconda3\lib\contextlib.py", line 130, in __exit__
self.gen.throw(type, value, traceback)
File "C:\Anaconda3\lib\site-packages\dask\dataframe\utils.py", line 190, in raise_on_meta_error
raise ValueError(msg)
ValueError: Metadata inference failed in `apply`.
You have supplied a custom function and Dask is unable to
determine the type of output that that function returns.
To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.
Original error is below:
------------------------
AttributeError("'str' object has no attribute 'get'", 'occurred at index 0')
Traceback:
---------
File "C:\Anaconda3\lib\site-packages\dask\dataframe\utils.py", line 169, in raise_on_meta_error
yield
File "C:\Anaconda3\lib\site-packages\dask\dataframe\core.py", line 4711, in _emulate
return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
File "C:\Anaconda3\lib\site-packages\dask\utils.py", line 854, in __call__
return getattr(obj, self.method)(*args, **kwargs)
File "C:\Anaconda3\lib\site-packages\pandas\core\frame.py", line 6487, in apply
return op.get_result()
File "C:\Anaconda3\lib\site-packages\pandas\core\apply.py", line 151, in get_result
return self.apply_standard()
File "C:\Anaconda3\lib\site-packages\pandas\core\apply.py", line 257, in apply_standard
self.apply_series_generator()
File "C:\Anaconda3\lib\site-packages\pandas\core\apply.py", line 286, in apply_series_generator
results[i] = self.f(v)
File "<stdin>", line 1, in <lambda>
File "<stdin>", line 4, in output_dict
Right, operations like map_partitions need to know the column names and data types. As you've mentioned, you can specify this with the meta= keyword.
Perhaps you can run through your data once to compute what these will be, then construct a proper meta object and pass that in. This is inefficient and requires reading through all of your data, but I'm not sure there is another way.
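A rough sketch of that two-pass idea, reusing the df_bag and dd_output_dict from the question (the column-discovery pass and the object dtypes are my own assumptions, not a prescribed dask recipe):

import dask.bag as db

# First pass: flatten every record once and collect the union of all column names.
flattened = df_bag.map(dd_output_dict)
all_columns = flattened.map(lambda row: set(row.keys())).fold(lambda a, b: a | b).compute()

# Build a meta mapping each discovered column to a dtype; object is a safe
# default when the real types are unknown.
meta = {col: "object" for col in sorted(all_columns)}

# Second pass: build the dataframe with explicit meta, so no inference is needed.
df = flattened.to_dataframe(meta=meta)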
I want to get live cryptocurrency prices from the Binance REST API.
I am using:
def inCoin(coin):
    url = 'https://api.binance.com/api/v3/ticker/price?symbol=' + coin + 'USDT'
    df = pd.read_json(url)
    df.columns = ["symbol", "price"]
    return df
It gives the following error when this function is called:
Traceback (most recent call last):
File "ee2.py", line 201, in <module>
aa = inCoin('BTC')
File "ee2.py", line 145, in inCoin
df = pd.read_json(url, orient='index')
File "/home/hspace/.local/lib/python3.6/site-packages/pandas/io/json/json.py", line 422, in read_json
result = json_reader.read()
File "/home/hspace/.local/lib/python3.6/site-packages/pandas/io/json/json.py", line 529, in read
obj = self._get_object_parser(self.data)
File "/home/hspace/.local/lib/python3.6/site-packages/pandas/io/json/json.py", line 546, in _get_object_parser
obj = FrameParser(json, **kwargs).parse()
File "/home/hspace/.local/lib/python3.6/site-packages/pandas/io/json/json.py", line 638, in parse
self._parse_no_numpy()
File "/home/hspace/.local/lib/python3.6/site-packages/pandas/io/json/json.py", line 861, in _parse_no_numpy
loads(json, precise_float=self.precise_float), dtype=None).T
File "/home/hspace/.local/lib/python3.6/site-packages/pandas/core/frame.py", line 348, in __init__
mgr = self._init_dict(data, index, columns, dtype=dtype)
File "/home/hspace/.local/lib/python3.6/site-packages/pandas/core/frame.py", line 459, in _init_dict
return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
File "/home/hspace/.local/lib/python3.6/site-packages/pandas/core/frame.py", line 7356, in _arrays_to_mgr
index = extract_index(arrays)
File "/home/hspace/.local/lib/python3.6/site-packages/pandas/core/frame.py", line 7393, in extract_index
raise ValueError('If using all scalar values, you must pass'
ValueError: If using all scalar values, you must pass an index
Previously, I used this function to fetch historical data from the Binance API:
def Cryptodata2(symbol, tick_interval='1m'):
    url = 'https://api.binance.com/api/v1/klines?symbol=' + symbol + '&interval=' + tick_interval
    df = pd.read_json(url)
    df.columns = ["date", "open", "high", "low", "close", "volume",
                  "close time", "quote asset volume", "number of trades",
                  "taker buy base asset volume", "Taker buy quote asset volume", "ignore"]
    df['date'] = pd.to_datetime(df['date'], dayfirst=True, unit='ms')
    df.set_index('date', inplace=True)
    del df['ignore']
    return df
And this works fine.
I just want the price of the coin, as a number or a dataframe, from this URL:
https://api.binance.com/api/v3/ticker/price?symbol=BTCUSDT
Thanks for helping me.
Also, it would be great if you could provide more detail on debugging such "value" errors.
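For what it's worth, the klines endpoint returns a JSON array, which read_json can turn into rows, while this ticker endpoint returns a single JSON object of scalar values, which pandas can't build a frame from without an index; that is where the error comes from. A minimal sketch of one way around it (using requests, which isn't in the question but is a common choice):

import requests
import pandas as pd

def inCoin(coin):
    url = 'https://api.binance.com/api/v3/ticker/price?symbol=' + coin + 'USDT'
    data = requests.get(url).json()   # e.g. {"symbol": "BTCUSDT", "price": "43000.01"}
    price = float(data["price"])      # the plain number, if that is all you need
    df = pd.DataFrame([data])         # wrapping the dict in a list gives a one-row frame
    return price, df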
I am trying to create a separate Python file with the code given below. When calling the method, I pass mydata as a data frame with these columns:
['wage', 'educ', 'exper', 'tenure']
import pandas as pd
import numpy as np
from prettytable import PrettyTable as pt

def LinearRegressionOLS(mydata, target_column):
    if not isinstance(mydata, pd.DataFrame):
        raise TypeError("Data must be of type Data Frame")
    if not isinstance(target_column, str):
        raise TypeError("target_column must be String")
    if target_column not in mydata.columns:
        raise KeyError("target_column doesn't exist in Data Frame")

    data = mydata.copy()
    data["one"] = np.ones(data.count()[target_column])
    column_list = ["one"]
    for i in data.columns:
        column_list.append(i)

    Y = data[target_column].as_matrix()
    data.drop(target_column, inplace=True, axis=1)
    X = data[column_list].as_matrix()
    del data

    beta = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T, X)), X.T), Y)
    predY = np.matmul(X, beta)
    total = np.matmul((Y - np.mean(Y)).T, (Y - np.mean(Y)))
    residual = np.matmul((Y - predY).T, (Y - predY))
    sigma = np.matmul((Y - predY).T, (Y - predY)) / (X.shape[0] - X.shape[1])
    omega = np.square(sigma) * np.linalg.inv(np.matmul(X.T, X))
    SE = np.sqrt(np.diag(omega))
    tstat = beta / SE
    Rsq = 1 - (residual / total)

    final = pt()
    final.add_column(" ", column_list)
    final.add_column("Coefficients", beta)
    final.add_column("Standard Error", SE)
    final.add_column("t-stat", tstat)
    print(final)
    print("Residual: ", residual)
    print("Total: ", total)
    print("Standard Error: ", sigma)
    print("R Square: ", Rsq)
After running the above code, I call the function as shown below:
>>> c
['wage', 'educ', 'exper', 'tenure']
>>> import LR_OLS as inf
>>> inf.LinearRegressionOLS(file[c], "wage")
and I get an error like this:
Traceback (most recent call last):
File "<pyshell#182>", line 1, in <module>
inf.LinearRegressionOLS(file[c],"wage")
File "E:\python\LR_OLS.py", line 29, in LinearRegressionOLS
File "C:\Program Files\Python35\lib\site-packages\pandas\core\frame.py", line 2133, in __getitem__
return self._getitem_array(key)
File "C:\Program Files\Python35\lib\site-packages\pandas\core\frame.py", line 2177, in _getitem_array
indexer = self.loc._convert_to_indexer(key, axis=1)
File "C:\Program Files\Python35\lib\site-packages\pandas\core\indexing.py", line 1269, in _convert_to_indexer
.format(mask=objarr[mask]))
KeyError: "['wage'] not in index"
Can anyone help me understand why I am getting this error? How can I resolve it?
The problem is that you still have 'wage' in column_list: the list is built from data.columns before the target column is dropped, so X = data[column_list] later asks for a 'wage' column that no longer exists. To make sure it never gets in there, adapt the loop as follows:
for i in data.columns:
    if i != target_column:  # add this line: skip the target ('wage' here)
        column_list.append(i)
Note that "one" also ends up in column_list twice (it is appended once before the loop and again by the loop, since data already contains it), so you probably want to skip it here as well.
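A quick sanity check of the fix, sketched with random toy data of my own (note also that DataFrame.as_matrix() was removed in newer pandas, where .to_numpy() is the replacement):

import numpy as np
import pandas as pd
import LR_OLS as inf

rng = np.random.default_rng(0)
file = pd.DataFrame(rng.random((50, 4)), columns=['wage', 'educ', 'exper', 'tenure'])

# With the corrected loop (both fixes above), column_list no longer contains
# 'wage', so data[column_list] succeeds after the target is dropped.
inf.LinearRegressionOLS(file[['wage', 'educ', 'exper', 'tenure']], "wage")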