Value Error Mismatch While Converting Using Pandas - python

Here is the mismatch error I keep getting. I'm inputting "202710".
Traceback (most recent call last):
File "nbastatsrecieveit.py", line 29, in <module>
df.columns = headers
File "C:\Users\*\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\pandas\core\generic.py", line 5149, in __setattr__
return object.__setattr__(self, name, value)
File "pandas\_libs\properties.pyx", line 66, in pandas._libs.properties.AxisProperty.__set__
File "C:\Users\*\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\pandas\core\generic.py", line 564, in _set_axis
self._mgr.set_axis(axis, labels)
File "C:\Users\*\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\pandas\core\internals\managers.py", line 226, in set_axis
raise ValueError(
ValueError: Length mismatch: Expected axis has 0 elements, new values have 24 elements
To be honest, I'm not sure how to go about fixing this problem, as it works with specific player IDs but not all of them. Here is the rest of my code:
from nba_api.stats.endpoints import shotchartdetail
import pandas as pd
import json
from openpyxl import Workbook

print('Player ID?')
playerid = input()
filename = str(playerid) + '.xlsx'

response = shotchartdetail.ShotChartDetail(
    team_id=0,
    context_measure_simple='FGA',
    #last_n_games = numGames,
    game_id_nullable='0041900403',
    player_id=playerid
)
content = json.loads(response.get_json())

# transform contents into dataframe
results = content['resultSets'][0]
headers = results['headers']
rows = results['rowSet']
df = pd.DataFrame(rows)
df.columns = headers

# write to excel file
df.to_excel(filename, index=False)

This is because your df is empty for ID 202710: that player presumably has no shots recorded for game '0041900403', so rowSet comes back empty and the resulting DataFrame has zero columns. Exception handling will resolve the issue here:
df = pd.DataFrame(rows)
try:
    df.columns = headers
except ValueError:  # rowSet was empty, so there are no columns to rename
    pass
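Alternatively (a small sketch that isn't in the original answer, but relies on standard pandas behavior), you can pass the headers straight to the constructor; with an empty rowSet this still yields a frame that has the expected columns, so the rest of the script keeps working:
df = pd.DataFrame(rows, columns=headers)  # works even when rows == []
df.to_excel(filename, index=False)        # writes just the header row for empty results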

string.split() giving memory error in pandas dataframe

I am trying to split a string but am getting a memory error. Is there any way to solve this, or an alternative solution?
I am getting the error in the code below:
content_str = str(content_str).split('\n')
df1 = pd.DataFrame(content_str)
df1 = df1[0].str.split(',', expand=True)
Error:
Traceback (most recent call last):
File "ravi_sir.py", line 47, in <module>
df1 = df1[0].str.split(',', expand=True)
File "/app/python3/lib/python3.6/site-packages/pandas/core/strings.py", line 2001, in wrapper
return func(self, *args, **kwargs)
File "/app/python3/lib/python3.6/site-packages/pandas/core/strings.py", line 2690, in split
return self._wrap_result(result, expand=expand, returns_string=expand)
File "/app/python3/lib/python3.6/site-packages/pandas/core/strings.py", line 2272, in _wrap_result
result = cons(result, columns=name, index=index, dtype=dtype)
File "/app/python3/lib/python3.6/site-packages/pandas/core/frame.py", line 520, in __init__
mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/construction.py", line 93, in arrays_to_mgr
return create_block_manager_from_arrays(arrays, arr_names, axes)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/managers.py", line 1650, in create_block_manager_from_arrays
blocks = form_blocks(arrays, names, axes)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/managers.py", line 1739, in form_blocks
object_blocks = _simple_blockify(items_dict["ObjectBlock"], np.object_)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/managers.py", line 1784, in _simple_blockify
values, placement = _stack_arrays(tuples, dtype)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/managers.py", line 1830, in _stack_arrays
stacked = np.empty(shape, dtype=dtype)
MemoryError
I am trying to read zipped files from an S3 bucket and save the contents into a dataframe to get the total record count of each file. My full code is given below:
list_table = []
for table in d:
    dict_table = OrderedDict()
    s_time = datetime.datetime.now().strftime("%H:%M:%S")
    print("start_time--->>", s_time)
    print("tablename--->>", table)
    s3 = boto3.resource('s3')
    key = 'raw/vs-1/load-1619/data' + '/' + table
    obj = s3.Object('********', key)
    n = obj.get()['Body'].read()
    gzipfile = BytesIO(n)
    gzipfile = gzip.GzipFile(fileobj=gzipfile)
    content = gzipfile.read()
    #print(content)
    content_str = content.decode('utf-8')
    content_str = str(content_str).split('\n')
    df1 = pd.DataFrame(content_str)
    df1 = df1[0].str.split(',', expand=True)
    #df1 = pd.DataFrame([x.split(',') for x in str(content_str).split('\n')])
    #print(df1)
    #count = os.popen('aws s3 cp s3://itx-agu-lake/raw/vs-1/load-1619/data/{0} - | wc -l'.format(table)).read()
    count = int(len(df1)) - 2
    del(df1)
    e_time = datetime.datetime.now().strftime("%H:%M:%S")
    print("End_time---->>", e_time)
    print(count)
    dict_table['Table_Name'] = str(table)
    dict_table['Count'] = count
    list_table.append(dict_table)
Since you are splitting a huge string into a df column and then deleting the df, it looks like you only need the count of commas for each row. So get the count directly, which is simple, rather than splitting the df, which can generate a huge number of columns and therefore cause your memory error.
row1list = ['1,2,3,4']
row2list = ['5,6']
row3list = ['7,8,9']
df = pd.DataFrame([row1list, row2list, row3list], columns=['col'])
df['count_commas'] = df['col'].str.count(',')
print(df)
#        col  count_commas
# 0  1,2,3,4             3
# 1      5,6             1
# 2    7,8,9             2
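In this specific loop, though, only len(df1) is ever used, so you could skip building a dataframe entirely and count line breaks in the decoded string. A minimal sketch (assuming the records are newline-separated, as in the original split('\n')):
content_str = content.decode('utf-8')
n_lines = content_str.count('\n') + 1  # == len(content_str.split('\n'))
count = n_lines - 2                    # same adjustment as the original len(df1) - 2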

Problems with CSV | Stock Price Manipulation

Hi everyone! I'm going through this course and am having issues. The line I'm having problems with is
df[f'{ticker}_{i}d'] = (df[ticker].shift(-i) - df[ticker]) / df[ticker]
You can find this line in the process_data_for_labels(ticker) function. Can anyone tell me what's going on? I copied his code exactly and am getting the same error.
import bs4 as bs
import requests
import pickle
import datetime as dt
import os
import pandas as pd
import pandas_datareader.data as web
import time
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
from collections import Counter

style.use('dark_background')

def save_sp500_tickers():
    resp = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    tickers = []
    for row in table.findAll('tr')[1:]:
        ticker = row.findAll('td')[0].text
        tickers.append(ticker.rstrip())
    with open("sp500tickers.pickle", "wb") as f:
        pickle.dump(tickers, f)
    print(tickers)
    return tickers
#save_sp500_tickers()

def get_data_from_yahoo(reload_sp500=False):
    if reload_sp500:
        tickers = save_sp500_tickers()
    else:
        with open("sp500tickers.pickle", "rb") as f:
            tickers = pickle.load(f)
    if not os.path.exists('stock_dfs'):
        os.makedirs('stock_dfs')
    start = dt.datetime(2015, 1, 1)
    end = dt.datetime(2020, 7, 1)
    for ticker in tickers:
        if not os.path.exists('stock_dfs/{ticker}.csv'):
            if '.' in ticker:
                ticker = ticker.replace('.', '-')
            time.sleep(1)
            print(ticker)
            df = web.DataReader(ticker, 'yahoo', start, end)
            df.to_csv('stock_dfs/{}.csv'.format(ticker))
        else:
            print(f'Already have {ticker}')
#get_data_from_yahoo()

def compile_data():
    with open("sp500tickers.pickle", "rb") as f:
        tickers = pickle.load(f)
    main_df = pd.DataFrame()
    for count, ticker in enumerate(tickers):
        if '.' in ticker:
            ticker = ticker.replace('.', '-')
        df = pd.read_csv(f'stock_dfs/{ticker}.csv')
        df.set_index('Date', inplace=True)
        df.rename(columns={'Adj Close': ticker}, inplace=True)
        df.drop(['Open', 'High', 'Low', 'Close', 'Volume'], 1, inplace=True)
        if main_df.empty:
            main_df = df
        else:
            main_df = main_df.join(df, how='outer')
        if count % 10 == 0:
            print(count)
    print(main_df.head())
    main_df.to_csv('sp500_joined_closes.csv')
#compile_data()

def visualize_data():
    df = pd.read_csv('sp500_joined_closes.csv')
    #df['AAPL'].plot()
    #plt.show()
    df_corr = df.corr()
    print(df_corr.head())
    data = df_corr.values
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    heatmap = ax.pcolor(data, cmap=plt.cm.RdYlGn)
    fig.colorbar(heatmap)
    ax.set_xticks(np.arange(data.shape[0]) + 0.5, minor=False)
    ax.set_yticks(np.arange(data.shape[1]) + 0.5, minor=False)
    ax.invert_yaxis()
    ax.xaxis.tick_top()
    column_labels = df_corr.columns
    row_labels = df_corr.index
    ax.set_xticklabels(column_labels)
    ax.set_yticklabels(row_labels)
    plt.xticks(rotation=90)
    heatmap.set_clim(-1, 1)
    plt.tight_layout()
    plt.show()
#visualize_data()

# Machine Learning
def process_data_for_labels(ticker):
    hm_days = 7
    df = pd.read_csv('sp500_joined_closes.csv', index_col=0)
    tickers = df.columns.values.tolist()
    df.fillna(0, inplace=True)
    for i in range(1, hm_days + 1):
        df[f'{ticker}_{i}d'] = (df[ticker].shift(-i) - df[ticker]) / df[ticker]
    df.fillna(0, inplace=True)
    return tickers, df

def buy_sell_hold(*args):
    cols = [c for c in args]
    requirement = 0.2
    for col in cols:
        if col > requirement:
            return 1
        if col < -requirement:
            return -1
    return 0

def extract_featuresets(ticker):
    tickers, df = process_data_for_labels(ticker)
    end = [eval(f"df[f'{ticker}_{i}']") for i in range(1, 8)]
    df[f'{ticker}_target'] = list(map(
        buy_sell_hold,
        [exec(f"df[f'{ticker}_{i}']") for i in range(1, 8)]
    ))
    vals = df[f'{ticker}_target'].values.tolist()
    str_vals = [str(i) for i in vals]
    print('Data spread: ', Counter(str_vals))
    df.fillna(0, inplace=True)
    df = df.replace([np.inf, -np.inf], np.nan)
    df.dropna(inplace=True)
    df_vals = df[[ticker for ticker in tickers]].pct_change()
    df_vals = df_vals.replace([np.inf, -np.inf], 0)
    df_vals.fillna(0, inplace=True)
    X = df_vals.values
    y = df['{ticker}_target'].values
    return X, y, df

extract_featuresets('APPL')
Error:
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2646, in get_loc
return self._engine.get_loc(key)
File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1618, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1626, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'APPL'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "test.py", line 176, in <module>
extract_featuresets('APPL')
File "test.py", line 152, in extract_featuresets
tickers, df = process_data_for_labels(ticker)
File "test.py", line 132, in process_data_for_labels
df[f'{ticker}_{i}d'] = (df[ticker].shift(-i) - df[ticker]) / df[ticker]
File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2800, in __getitem__
indexer = self.columns.get_loc(key)
File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2648, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1618, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1626, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'APPL'
You've correctly identified where the problem begins, but you need to continue to follow the breadcrumbs.
The error says "KeyError: 'APPL'", where 'APPL' is expected to be one of the stock tickers used as column names / keys in your stock price dataframe df (at least the program expects that). However, in this instance df doesn't contain the key / header 'APPL'. Maybe something went wrong when loading the data where pd.read_csv is used to read the CSV file? Or maybe the file itself is missing data?
Try opening a Python terminal and simply loading the CSV file. Is it what you (or the program) would expect?
Keep digging!
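For example, a quick sketch of that check (assuming the CSV produced by compile_data() above):
import pandas as pd
df = pd.read_csv('sp500_joined_closes.csv', index_col=0)
print('APPL' in df.columns)  # False: Apple's ticker is 'AAPL', so 'APPL' will never match
print([c for c in df.columns if c.startswith('A')][:10])  # inspect the actual column names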

How to resolve "ValueError: Wrong number of items passed"?

I can't get data into Excel using this code:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd

driver = webdriver.Chrome()
mainurl = 'https://offerup.com/explore/sck/tx/austin/cars-trucks/'
driver.get(mainurl)
res = driver.execute_script("return document.documentElement.outerHTML")
page_soup = BeautifulSoup(res, 'html.parser')
# btn = driver.find_element_by_xpath('//*[@id="react-container"]/div/div[2]/div[2]/div[2]/div[3]/button').click()

records = []
for a in page_soup.find_all('span', class_='_nn5xny4 _y9ev9r'):
    title = a.text
    print(title)
    records.append(title)

prices = []
for b in page_soup.find_all('span', class_='_s3g03e4'):
    price = b.text
    print(price)
    prices.append(price)

location = []
for c in page_soup.find_all('span', class_='_19rx43s2'):
    loc = c.text
    print(loc)
    location.append(loc)

df = pd.DataFrame(records, prices, location)
print(df)
df.to_csv('trymenew.csv')
Traceback (most recent call last):
File "C:\Users\Saba\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\internals\managers.py", line 1654, in create_block_manager_from_blocks
make_block(values=blocks[0], placement=slice(0, len(axes[0])))
File "C:\Users\Saba\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\internals\blocks.py", line 3047, in make_block
return klass(values, ndim=ndim, placement=placement)
File "C:\Users\Saba\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\internals\blocks.py", line 2595, in __init__
super().__init__(values, ndim=ndim, placement=placement)
File "C:\Users\Saba\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\internals\blocks.py", line 124, in __init__
raise ValueError(
ValueError: Wrong number of items passed 1, placement implies 44
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Saba\Desktop\cars_offerup.py", line 29, in <module>
df = pd.DataFrame(records, prices, location)
File "C:\Users\Saba\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\frame.py", line 488, in __init__
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
File "C:\Users\Saba\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\internals\construction.py", line 210, in init_ndarray
return create_block_manager_from_blocks(block_values, [columns, index])
File "C:\Users\Saba\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\internals\managers.py", line 1664, in create_block_manager_from_blocks
construction_error(tot_items, blocks[0].shape[1:], axes, e)
File "C:\Users\Saba\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\internals\managers.py", line 1694, in construction_error
raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
ValueError: Shape of passed values is (44, 1), indices imply (44, 44)
I have this type of error, can you help me solve it?
As we can see in the documentation, pd.DataFrame expects the data as its first argument, but you passed the data as 3 separate arguments, so prices was interpreted as the index and location as the columns.
In order to fix that, we change:
df = pd.DataFrame(records, prices, location) # Wrong
to
df = pd.DataFrame([records, prices, location]) # Correct
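Note that pd.DataFrame([records, prices, location]) builds one row per list. If you want each list to be a column instead, the dict form is clearer (a sketch; the column names here are just illustrative, and the lists must all be the same length):
df = pd.DataFrame({'title': records, 'price': prices, 'location': location})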

Python gives KeyError while the key is passed

I am trying to create a separate Python file with the code given below. When calling the method, I pass mydata as a data frame with these columns:
['wage', 'educ', 'exper', 'tenure']
import pandas as pd
import numpy as np
from prettytable import PrettyTable as pt

def LinearRegressionOLS(mydata, target_column):
    if not isinstance(mydata, pd.DataFrame):
        raise TypeError("Data must be of type Data Frame")
    if not isinstance(target_column, str):
        raise TypeError("target_column must be String")
    if target_column not in mydata.columns:
        raise KeyError("target_column doesn't exist in Data Frame")
    data = mydata.copy()
    data["one"] = np.ones(data.count()[target_column])
    column_list = ["one"]
    for i in data.columns:
        column_list.append(i)
    Y = data[target_column].as_matrix()
    data.drop(target_column, inplace=True, axis=1)
    X = data[column_list].as_matrix()
    del data
    beta = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T, X)), X.T), Y)
    predY = np.matmul(X, beta)
    total = np.matmul((Y - np.mean(Y)).T, (Y - np.mean(Y)))
    residual = np.matmul((Y - predY).T, (Y - predY))
    sigma = np.matmul((Y - predY).T, (Y - predY)) / (X.shape[0] - X.shape[1])
    omega = np.square(sigma) * np.linalg.inv(np.matmul(X.T, X))
    SE = np.sqrt(np.diag(omega))
    tstat = beta / SE
    Rsq = 1 - (residual / total)
    final = pt()
    final.add_column(" ", column_list)
    final.add_column("Coefficients", beta)
    final.add_column("Standard Error", SE)
    final.add_column("t-stat", tstat)
    print(final)
    print("Residual: ", residual)
    print("Total: ", total)
    print("Standard Error: ", sigma)
    print("R Square: ", Rsq)
After running the above code and calling the function as given below,
>>> c
['wage', 'educ', 'exper', 'tenure']
>>> import LR_OLS as inf
>>> inf.LinearRegressionOLS(file[c],"wage")
I get an error like this:
Traceback (most recent call last):
File "<pyshell#182>", line 1, in <module>
inf.LinearRegressionOLS(file[c],"wage")
File "E:\python\LR_OLS.py", line 29, in LinearRegressionOLS
File "C:\Program Files\Python35\lib\site-packages\pandas\core\frame.py", line 2133, in __getitem__
return self._getitem_array(key)
File "C:\Program Files\Python35\lib\site-packages\pandas\core\frame.py", line 2177, in _getitem_array
indexer = self.loc._convert_to_indexer(key, axis=1)
File "C:\Program Files\Python35\lib\site-packages\pandas\core\indexing.py", line 1269, in _convert_to_indexer
.format(mask=objarr[mask]))
KeyError: "['wage'] not in index"
Can anyone help me understand why I am getting this error? How can I resolve it?
The problem is that you still have 'wage' in column_list: the loop copies every column name from data, including the target column, but the target is later dropped from data, so data[column_list] can no longer find 'wage'. To keep the target from ever getting in there, adapt the loop as follows:
for i in data.columns:
    if i != target_column:  # skip the target column ('wage' here)
        column_list.append(i)
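As a side note, as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0, so on a recent pandas the same function needs to_numpy() (or .values) instead, e.g.:
Y = data[target_column].to_numpy()
X = data[column_list].to_numpy()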

How to merge python dictionary and create new key for the intersections

I have a dictionary of pandas dataframes called df. I want to split each dataframe based on a gap threshold of 4.5 on the time_epoch column and then merge all the results into a single collection.
From this question and this question, I came up with the following code, but I get an error:
keys = df.keys()
all = Counter()
for key in keys:
    ids = (df[key]['time_epoch'] > (df[key]['time_epoch'].shift() + 4.5)).cumsum()
    gp = df[key].groupby(ids)
    all.update(Counter(dict(list(gp))))
I get the following error:
Traceback (most recent call last):
File "C:\Users\...\Miniconda3\lib\site-packages\pandas\core\ops.py", line 1176, in na_op
raise_on_error=True, **eval_kwargs)
File "C:\Users\...\Miniconda3\lib\site-packages\pandas\core\computation\expressions.py", line 211, in evaluate
**eval_kwargs)
File "C:\Users\...\Miniconda3\lib\site-packages\pandas\core\computation\expressions.py", line 64, in _evaluate_standard
return op(a, b)
TypeError: must be str, not int
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\...\Miniconda3\lib\site-packages\pandas\core\internals.py", line 1184, in eval
result = get_result(other)
File "C:\Users\...\Miniconda3\lib\site-packages\pandas\core\internals.py", line 1153, in get_result
result = func(values, other)
File "C:\Users\...\Miniconda3\lib\site-packages\pandas\core\ops.py", line 1202, in na_op
result[mask] = op(xrav, y)
TypeError: must be str, not int
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:/code.py", line 53, in <module>
function()
File "D:/code.py", line 41, in function
all.update(Counter(dict(list(flow_key))))
Edit 1
My df is created as follows:
dftemp = pd.read_csv(
    "traffic.csv",
    skipinitialspace=True,
    usecols=[
        'time_epoch', 'ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport',
        'frame.len', 'tcp.flags', 'Protocol',
    ],
    na_filter=False,
    encoding="utf-8")
complete = pd.read_csv(
    "traffic.csv",
    skipinitialspace=True,
    usecols=[
        'frame.time_epoch', 'ip.src', 'ip.dst', 'tcp.srcport',
        'tcp.dstport', 'frame.len', 'tcp.flags', 'Protocol',
    ],
    na_filter=False,
    encoding="utf-8")
complete.loc[(complete['ip.dst'] == hostip[i]), 'frame.len'] = complete['frame.len'] * -1
complete.loc[(complete['frame.len'] < 0), 'ip.src'] = dftemp['ip.dst']
complete.loc[(complete['frame.len'] < 0), 'ip.dst'] = dftemp['ip.src']
complete.loc[(complete['frame.len'] < 0), 'tcp.srcport'] = dftemp['tcp.dstport']
complete.loc[(complete['frame.len'] < 0), 'tcp.dstport'] = dftemp['tcp.srcport']
complete_flow = complete.groupby(
    ['ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport', 'Protocol'])
df = dict(list(complete_flow))
df contains network traffic flows; I want to split each flow using a threshold on the packet timestamp gap.
Edit 2
I found that Counter only keeps a count for each key, so I iterate over the new dictionary and create a unique key for each group. Is there a more Pythonic way of doing this?
flows = {}
i = 1
for key in keys:
    i += 1
    flow_ids = (df[key]['time_epoch'] > (df[key]['time_epoch'].shift() + 4.5)).cumsum()
    gp = df[key].groupby(flow_ids)
    df2 = dict(list(gp))
    keys2 = df2.keys()
    for i in keys2:
        flows["%s, %s" % (key, i)] = df2[i]
    del df2
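For what it's worth, a more compact version of that loop (a sketch, assuming df is the dict of flow dataframes built in Edit 1) can do the same thing with a dict comprehension, keyed by the group id itself instead of a manual counter:
flows = {
    "%s, %s" % (key, gap_id): sub_df
    for key, frame in df.items()
    for gap_id, sub_df in frame.groupby(
        (frame['time_epoch'] > frame['time_epoch'].shift() + 4.5).cumsum()
    )
}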
