Is there any solution to "Result too large" in Python?

This is a program in Python, using the pandas library, that assigns different values to a new column depending on the values in the rest of the data. It works with small data, but for some reason, when I try to use the program with big data, it fails with an error. I tried the decimal library but it does not work. I think the error is related to the size of the float, but I can't find a way to fix it. Thanks.
Error:
runfile('C:/Users/Usuario/Documents/DOCUMENTOS Y DATOS/ESTUDIO/4. TFM/Task 1/TASK SIN FILTRADO/TASK 3.4 sf.py', wdir='C:/Users/Usuario/Documents/DOCUMENTOS Y DATOS/ESTUDIO/4. TFM/Task 1/TASK SIN FILTRADO')
Traceback (most recent call last):
File "<ipython-input-34-37348501d189>", line 1, in <module>
runfile('C:/Users/Usuario/Documents/DOCUMENTOS Y DATOS/ESTUDIO/4. TFM/Task 1/TASK SIN FILTRADO/TASK 3.4 sf.py', wdir='C:/Users/Usuario/Documents/DOCUMENTOS Y DATOS/ESTUDIO/4. TFM/Task 1/TASK SIN FILTRADO')
File "C:\Users\Usuario\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 827, in runfile
execfile(filename, namespace)
File "C:\Users\Usuario\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/Usuario/Documents/DOCUMENTOS Y DATOS/ESTUDIO/4. TFM/Task 1/TASK SIN FILTRADO/TASK 3.4 sf.py", line 76, in <module>
table["Flow_pattern"] = table.apply(lambda x: flow_pattern(x["Fcoef"], x["Vcoef"]), axis=1)
File "C:\Users\Usuario\Anaconda3\lib\site-packages\pandas\core\frame.py", line 6913, in apply
return op.get_result()
File "C:\Users\Usuario\Anaconda3\lib\site-packages\pandas\core\apply.py", line 186, in get_result
return self.apply_standard()
File "C:\Users\Usuario\Anaconda3\lib\site-packages\pandas\core\apply.py", line 292, in apply_standard
self.apply_series_generator()
File "C:\Users\Usuario\Anaconda3\lib\site-packages\pandas\core\apply.py", line 321, in apply_series_generator
results[i] = self.f(v)
File "C:/Users/Usuario/Documents/DOCUMENTOS Y DATOS/ESTUDIO/4. TFM/Task 1/TASK SIN FILTRADO/TASK 3.4 sf.py", line 76, in <lambda>
table["Flow_pattern"] = table.apply(lambda x: flow_pattern(x["Fcoef"], x["Vcoef"]), axis=1)
File "C:/Users/Usuario/Documents/DOCUMENTOS Y DATOS/ESTUDIO/4. TFM/Task 1/TASK SIN FILTRADO/TASK 3.4 sf.py", line 35, in flow_pattern
fun=e**((Vcoef+59627)/8432.5)
OverflowError: (34, 'Result too large', 'occurred at index 206')
CODE:
import pandas as pd
e=2.71828182845904523536
table = pd.read_csv('Coef_Data_sf.csv', sep = ',', header = 0)
flowmap = pd.read_csv('flow_map.csv', sep = ',', header = 1)
flowmaptag = pd.read_csv('flow_map.csv', sep = ',', header = 0)
# X - F COEF, Y - V COEF
# ANNULAR / WISPY:  y = 8432.5*ln(x) - 59627
# CHURN / SLUG:     y = 9.604*ln(x) - 29.739
# BUBBLY / SLUG:    y = 462514*x^(-1.29)
def flow_pattern(Fcoef, Vcoef):
    if Vcoef > 100:
        fun = e**((Vcoef + 59627)/8432.5)
        if Fcoef < fun:
            return "Annular"
        elif Fcoef > fun:
            return "Wispy_annular"
        else:
            return "error1"
    elif 48 < Vcoef < 100:
        if Fcoef < 5070:
            return "Churn"
        elif Fcoef > 5070:
            return "Bubbly"
    elif 10 < Vcoef < 48:
        fun1 = e**((Vcoef + 29.739)/9.604)
        if Fcoef < fun1:
            return "Churn"
        elif fun1 < Fcoef < 5070:
            return "Slug"
        elif 5070 < Fcoef:
            return "Bubbly"
        else:
            return "error2"
    elif Vcoef < 10:
        fun1 = e**((Vcoef + 29.739)/9.604)
        fun2 = e**((Vcoef + 1.29)/462514)
        if Fcoef < fun1:
            return "Churn"
        elif fun1 < Fcoef < fun2:
            return "Slug"
        elif fun2 < Fcoef:
            return "Bubbly"
        else:
            return "error3"
    else:
        return "errorfin"
table["Flow_pattern"] = table.apply(lambda x: flow_pattern(x["Fcoef"], x["Vcoef"]), axis=1)
table.to_csv('Data_with_flowpattern_sf.csv')
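For context (this is not from the original thread): Python floats are IEEE 754 doubles, so e**x raises OverflowError once the exponent passes roughly 709.78. A minimal sketch of a guard, assuming infinity is an acceptable stand-in for a boundary that runs off the flow map; safe_exp is a hypothetical helper, not part of the original script:

import math

def safe_exp(x):
    # math.exp raises OverflowError once x exceeds ~709.78, the log of
    # the largest IEEE 754 double; fall back to infinity instead.
    try:
        return math.exp(x)
    except OverflowError:
        return float('inf')

# Inside flow_pattern, the failing line would then read:
# fun = safe_exp((Vcoef + 59627) / 8432.5)

With an infinite fun, the comparison Fcoef < fun is simply True, so the offending row gets classified instead of crashing the whole apply.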

Related

MemoryError when running a Python script on Google Cloud

I am trying to use Google Cloud to run a script that makes predictions for every line of a test.csv file. I use the cloud because it looks like Google Colab is going to take some time. However, when I run it there is a memory error:
(pre_env) mikempc3@instance-1:~$ python predictSales.py
Traceback (most recent call last):
File "predictSales.py", line 7, in <module>
sales = pd.read_csv("sales_train.csv")
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/io/parsers.py", line 685, in parser_f
return _read(filepath_or_buffer, kwds)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/io/parsers.py", line 463, in _read
data = parser.read(nrows)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/io/parsers.py", line 1169, in read
df = DataFrame(col_dict, columns=columns, index=index)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/frame.py", line 411, in __init__
mgr = init_dict(data, index, columns, dtype=dtype)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/construction.py", line 257, in init_dict
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/construction.py", line 87, in arrays_to_mgr
return create_block_manager_from_arrays(arrays, arr_names, axes)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/managers.py", line 1694, in create_block_manager_from_arrays
blocks = form_blocks(arrays, names, axes)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/managers.py", line 1764, in form_blocks
int_blocks = _multi_blockify(items_dict["IntBlock"])
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/managers.py", line 1846, in _multi_blockify
values, placement = _stack_arrays(list(tup_block), dtype)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/managers.py", line 1874, in _stack_arrays
stacked = np.empty(shape, dtype=dtype)
MemoryError: Unable to allocate 67.2 MiB for an array with shape (3, 2935849) and data type int64
Here is my script:
import statsmodels.tsa.arima.model as smt
import pandas as pd
import datetime
import numpy as np

sales = pd.read_csv("sales_train.csv")
test = pd.read_csv("test.csv")

sales.date = sales.date.apply(lambda x: datetime.datetime.strptime(x, "%d.%m.%Y"))
sales_monthly = sales.groupby(
    ["date_block_num", "shop_id", "item_id"])["date", "item_price",
                                              "item_cnt_day"].agg({
    "date": ["min", "max"],
    "item_price": "mean",
    "item_cnt_day": "sum"})

array = []
for i, row in test.iterrows():
    print("row['shop_id']: ", row['shop_id'], " row['item_id']: ", row['item_id'])
    print(statsmodels.__version__)
    ts = pd.DataFrame(sales_monthly.loc[pd.IndexSlice[:, [row['shop_id']], [row['item_id']]], :]['item_price'].values *
                      sales_monthly.loc[pd.IndexSlice[:, [row['shop_id']], [row['item_id']]], :][
                          'item_cnt_day'].values).T.iloc[0]
    print(ts.values)
    if ts.values != [] and len(ts.values) > 2:
        best_aic = np.inf
        best_order = None
        best_model = None
        ranges = range(1, 5)
        for difference in ranges:
            # try:
            tmp_model = smt.ARIMA(ts.values, order=(0, 1, 0), trend='t').fit()
            tmp_aic = tmp_model.aic
            if tmp_aic < best_aic:
                best_aic = tmp_aic
                best_difference = difference
                best_model = tmp_model
            # except Exception as e:
            #     print(e)
            #     continue
        if best_model is not None:
            y_hat = best_model.forecast()[0]
            if y_hat < 0:
                y_hat = 0
        else:
            y_hat = 0
    else:
        y_hat = 0
    print("predicted:", y_hat)
    d = {'id': row['ID'], 'item_cnt_month': y_hat}
    array.append(d)
    print("-------------------")

df = pd.DataFrame(array)
df.to_csv("submission.csv")
You can use the Fil memory profiler (https://pythonspeed.com/fil) to figure out which lines of code are responsible for peak memory use. It will also handle out-of-memory conditions and dump a report when you run out.
The only caveats are that (1) it requires Python 3.6 or later, and (2) it will only run on Linux or macOS. We're up to Python 3.9 now, so it's probably time to upgrade regardless.
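Separately from profiling, a common way to shrink the allocation the traceback points at (int64 blocks spanning ~2.9 million rows) is to downcast columns at read time. A minimal sketch, assuming the column names used in the script above and that their values fit in 32 bits:

import pandas as pd

# Hypothetical dtype map for the columns the script uses; int32/float32
# take half the memory of pandas' default int64/float64 blocks.
dtypes = {
    "date_block_num": "int32",
    "shop_id": "int32",
    "item_id": "int32",
    "item_price": "float32",
    "item_cnt_day": "float32",
}
sales = pd.read_csv("sales_train.csv", dtype=dtypes)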

Python vestigial parameter and uncallable function

import numpy as np
from scipy.optimize import fsolve
from scipy.integrate import quad
import matplotlib.pyplot as plt

Rgas = 8.31446261815324  # Pa*m**3/(mol*K)

def Peng_Robinson_EOS(P, V, T, Tc, Pc, ω):
    a = (1 + (0.37464 + 1.54226*ω - 0.26992*ω**2)*(1 - (T/Tc)**(1/2)))**2 * Rgas**2*Tc**2/Pc  # Pa*m**6/mol**2
    b = 0.07780*Rgas*Tc/Pc
    return P + a/((V + (1 - np.sqrt(2))*b)*(V + (1 + np.sqrt(2))*b)) - Rgas*T/(V - b)

def PR_Psat(T, Tc, Pc, ω, V, Pguess=100000):
    def integral_diff(Pguess, T, Tc, Pc, ω, V):
        def Psat_integrand(V, Pguess, T, Tc, Pc, ω):
            integrand1 = fsolve(Peng_Robinson_EOS(Pguess, V, T, Tc, Pc, ω), Pguess)
            integrand2 = Pguess
            integrand = integrand1 - integrand2
            return integrand
        Vl = fsolve(Psat_integrand(V, Pguess, T, Tc, Pc, ω), 0)
        Vv_guess = Rgas*T/Pguess
        Vv = fsolve(Psat_integrand(V, Pguess, T, Tc, Pc, ω), Vv_guess)
        Vinf_guess = (Vl + Vv)/2
        Vinf = fsolve(Psat_integrand(V, Pguess, T, Tc, Pc, ω), Vinf_guess)
        left = quad(Psat_integrand(V, Pguess, T, Tc, Pc, ω), Vl, Vinf)[0]
        right = quad(Psat_integrand(V, Pguess, T, Tc, Pc, ω), Vinf, Vv)[0]
        diff = left + right
        return diff
    Psat = fsolve(integral_diff(Pguess, T, Tc, Pc, ω, V), Pguess)
    return Psat
There are two issues with this code.
1: In theory, PR_Psat should not depend on V, since all values of V used in the calculation are found via fsolve. However, because Peng_Robinson_EOS depends on V, Python won't let it be ignored in the enclosing functions. Is there a way to eliminate the need to "specify" V?
Here is the error from an earlier version (before V was a parameter of all the functions), to demonstrate:
runfile('...', wdir='...')
Traceback (most recent call last):
File "<ipython-input-4-0875bc6411e8>", line 1, in <module>
runfile('...', wdir='...')
File "...\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "...\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "...", line 40, in <module>
print(PR_Psat(300,647.1,22055000,0.345))
File "...", line 37, in PR_Psat
Psat = fsolve(integral_diff(Pguess,T,Tc,Pc,ω),Pguess)
File "...", line 25, in integral_diff
Vl = fsolve(Psat_integrand(V,Pguess,T,Tc,Pc,ω),0)
NameError: name 'V' is not defined
2: It seems that Peng_Robinson_EOS is not being treated as a callable function, but rather as a float. I'm not sure what is causing this.
runfile('...', wdir='...')
Traceback (most recent call last):
File "<ipython-input-13-0875bc6411e8>", line 1, in <module>
runfile('...', wdir='...')
File "C:\Users\Spencer\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "...\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "...", line 42, in <module>
print(PR_Psat(300,647.1,22055000,0.345,1))
File "...", line 39, in PR_Psat
Psat = fsolve(integral_diff(Pguess,T,Tc,Pc,ω,V),Pguess)
File "...", line 27, in integral_diff
Vl = fsolve(Psat_integrand(V,Pguess,T,Tc,Pc,ω),0)
File "...", line 23, in Psat_integrand
integrand1 = fsolve(Peng_Robinson_EOS(Pguess,V,T,Tc,Pc,ω),Pguess)
File "...\lib\site-packages\scipy\optimize\minpack.py", line 148, in fsolve
res = _root_hybr(func, x0, args, jac=fprime, **options)
File "...\lib\site-packages\scipy\optimize\minpack.py", line 214, in _root_hybr
shape, dtype = _check_func('fsolve', 'func', func, x0, args, n, (n,))
File "...\lib\site-packages\scipy\optimize\minpack.py", line 27, in _check_func
res = atleast_1d(thefunc(*((x0[:numinputs],) + args)))
TypeError: 'numpy.float64' object is not callable
In theory, Peng_Robinson_EOS, plotted as P(V) at constant T, produces a cubic curve. The goal of PR_Psat is to find the value of P for which the integrals between P and the cubic cancel (hence integral_diff being plugged into fsolve).
To summarize the questions,
1) Is there a way to eliminate the need for V in PR_Psat?
2) Why is Peng_Robinson_EOS being flagged as an un-callable numpy.float64 object?
The problem was the call syntax: the arguments for fsolve and quad were placed incorrectly. To fix both problems, I moved the extra parameters into the args keyword at the end. For example,
incorrect:
Vv = fsolve(Psat_integrand(V,Pguess,T,Tc,Pc,ω),Vv_guess)
correct:
Vv = fsolve(Psat_integrand,Vv_guess,args=(Pguess,T,Tc,Pc,ω))
The reason Peng_Robinson_EOS is reported as a numpy.float64 is that, with the syntax in the question, the function is evaluated first and its result is passed to the solver instead of the function itself.
Also, with proper syntax, V is no longer an issue.
def PR_Psat(T, Tc, Pc, ω, Pguess=1000):
    def integral_diff(Pguess, T, Tc, Pc, ω):
        def Psat_integrand(V, Pguess, T, Tc, Pc, ω):
            integrand1 = fsolve(Peng_Robinson_EOS, Pguess, args=(V, T, Tc, Pc, ω))
            integrand2 = Pguess
            integrand = integrand1 - integrand2
            return integrand
        Vl = fsolve(Psat_integrand, 0, args=(Pguess, T, Tc, Pc, ω))
        Vv_guess = Rgas*T/Pguess
        Vv = fsolve(Psat_integrand, Vv_guess, args=(Pguess, T, Tc, Pc, ω))
        Vinf_guess = (Vl + Vv)/2
        Vinf = fsolve(Psat_integrand, Vinf_guess, args=(Pguess, T, Tc, Pc, ω))
        left = quad(Psat_integrand, Vl, Vinf, args=(Pguess, T, Tc, Pc, ω))[0]
        right = quad(Psat_integrand, Vinf, Vv, args=(Pguess, T, Tc, Pc, ω))[0]
        diff = left + right
        return diff
    Psat = fsolve(integral_diff, Pguess, args=(T, Tc, Pc, ω))
    return Psat
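To see why the argument placement matters, here is a self-contained toy (mine, not from the original post): calling the function first hands fsolve a number, while passing the function object and routing parameters through args lets the solver call it repeatedly.

import numpy as np
from scipy.optimize import fsolve

def f(x, a):
    # Residual with a root at sqrt(a).
    return x**2 - a

# Wrong: f(3.0, 4.0) evaluates to 5.0, so fsolve receives a number,
# not a callable -> TypeError: object is not callable.
# fsolve(f(3.0, 4.0), 3.0)

# Right: pass the function itself; extra parameters go through args.
root = fsolve(f, 3.0, args=(4.0,))
print(root)  # approximately [2.]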

How to fix "Length of value doesnt match index" in python?

I've been trying to get the list of addresses of a franchise in Brazil, but when I run the code, it starts, runs two cities, and then stops with a ValueError.
The code I've been trying to run is this:
import requests
import json
import pandas as pd
dMun = pd.read_json('https://servicodados.ibge.gov.br/api/v1/localidades/municipios')
dEndTotal = pd.DataFrame()
for iMun in range(len(dMun)):
    sCidade = dMun.loc[iMun, 'nome']
    print(str(iMun) + ' - ' + dMun.loc[iMun, 'nome'])
    sSigla = dMun.loc[iMun, 'microrregiao']['mesorregiao']['UF']['sigla']
    r = requests.post('https://www.5asec.com.br/busca-lojas-endereco',
                      data={'endereco': 'A, 1 {}/{}'.format(sCidade, sSigla)})
    jEnd = json.loads(r.text)
    dEnd = pd.DataFrame.from_records(jEnd['lojas'])
    print(dEnd)
    if len(dEnd) > 0:
        for sChave in jEnd['lojas'][0]['Endereco'].keys():
            dEnd[sChave] = []
        for i in range(len(dEnd)):
            for sChave in jEnd['lojas'][i]['Endereco'].keys():
                dEnd[sChave][i] = jEnd['lojas'][i]['Endereco'][sChave]
        dEndTotal = pd.concat([dEndTotal, dEnd], ignore_index=False).drop_duplicates().reset_index(drop=True)
But it results in this error:
0 - Alta Floresta D'Oeste
Empty DataFrame
Columns: []
Index: []
1 - Ariquemes
CEP Codigo CodigoExterno ... Telefone TemEcommerce Url
0 76870512 675 69004P ... 35366864 False ariquemes
[1 rows x 16 columns]
Traceback (most recent call last):
File "<ipython-input-1-cd9a35514f7e>", line 1, in <module>
runfile('C:/Users/vinis/OneDrive/Área de Trabalho/5aSec.py', wdir='C:/Users/vinis/OneDrive/Área de Trabalho')
File "C:\Users\vinis\Anaconda2\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 786, in runfile
execfile(filename, namespace)
File "C:\Users\vinis\Anaconda2\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 95, in execfile
exec(compile(scripttext, filename, 'exec'), glob, loc)
File "C:/Users/vinis/OneDrive/Área de Trabalho/5aSec.py", line 38, in <module>
File "C:\Users\vinis\Anaconda2\lib\site-packages\pandas\core\frame.py", line 3370, in __setitem__
self._set_item(key, value)
File "C:\Users\vinis\Anaconda2\lib\site-packages\pandas\core\frame.py", line 3445, in _set_item
value = self._sanitize_column(key, value)
File "C:\Users\vinis\Anaconda2\lib\site-packages\pandas\core\frame.py", line 3630, in _sanitize_column
value = sanitize_index(value, self.index, copy=False)
File "C:\Users\vinis\Anaconda2\lib\site-packages\pandas\core\internals\construction.py", line 519, in sanitize_index
raise ValueError('Length of values does not match length of index')
ValueError: Length of values does not match length of index
How can I fix this one?
Thanks for the help, guys, and I'm sorry if the post isn't all correct.
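No answer was recorded here, but the traceback does identify the failing line: dEnd[sChave] = [] assigns a zero-length list as a column of a non-empty frame, which is exactly the length mismatch pandas complains about. A minimal sketch of one way around it, building each column in full before assignment and assuming every store in jEnd['lojas'] carries the same Endereco keys:

if len(dEnd) > 0:
    for sChave in jEnd['lojas'][0]['Endereco'].keys():
        # One value per store, so the column length always matches the index.
        dEnd[sChave] = [loja['Endereco'][sChave] for loja in jEnd['lojas']]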

TypeError: unhashable type: 'slice'

I am trying to run a regression using the following dataframe, dfMyRoll. The head of the dataframe looks like:
SCORE SCORE_LAG
date
2007-10-29 -0.031551 NaN
2007-10-30 0.000100 -0.031551
2007-10-31 0.000100 0.000100
2007-11-01 0.000100 0.000100
2007-11-02 0.000100 0.000100
The code that I am using is :
import glob
import pandas as pd
import os.path
import scipy
from scipy.stats import linregress
def main():
    dataPath = "C:/Users/Stacey/Documents/data/Roll"
    roll = 4
    ID1 = "BBG.XNGS.AAPL.S"
    ID2 = "BBG.XNGS.AMAT.S"
    print(ID1, ID2)
    cointergration = getCointergration(dataPath, ID1, ID2, roll)
    return

def getCointergration(dataPath, ID1, ID2, roll):
    for myRoll in range((roll-4), roll, 1):
        path = dataPath + str(myRoll) + '/'
        filename = 'PairData_' + ID1 + '_' + ID2 + '.csv'
        for fname in glob.iglob(path + filename):
            dfMyRoll = pd.read_csv(fname, header=0, usecols=[0, 31],
                                   parse_dates=[0], dayfirst=True,
                                   index_col=[0], names=['date', 'SCORE'])
            dfMyRoll['SCORE_LAG'] = dfMyRoll['SCORE'].shift(1)
            print('cointergration', dfMyRoll.head())
            X = dfMyRoll[1:, 'SCORE']
            Y = dfMyRoll[1:, 'SCORE_LAG']
            slope, intercept, _, _, stderr = linregress(dfMyRoll[1:, 'SCORE'],
                                                        dfMyRoll[1:, 'SCORE_LAG'])

if __name__ == "__main__":
    print("CointergrationTest...19/05/17")
    try:
        main()
    except KeyboardInterrupt:
        print("Ctrl+C pressed. Stopping...")
I get the error: TypeError: unhashable type: 'slice'. I have looked at previous posts on this subject and tried adding iloc to the X and Y time series in the following way:
X = dfMyRoll.iloc[1:,'SCORE']
Y = dfMyRoll.iloc[1:,'SCORE_LAG']
but unfortunately I can't seem to find a solution. Please see below for a stack trace:
Traceback (most recent call last):
File "<ipython-input-3-431422978139>", line 1, in <module>
runfile('C:/Users/Stacey/Documents/scripts/cointergrationTest.py', wdir='C:/Users/Stacey/Documents/scripts')
File "C:\Anaconda\lib\site-packages\spyder\utils\site\sitecustomize.py", line 866, in runfile
execfile(filename, namespace)
File "C:\Anaconda\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/Stacey/Documents/scripts/cointergrationTest.py", line 64, in <module>
main()
File "C:/Users/Stacey/Documents/scripts/cointergrationTest.py", line 23, in main
cointergration = getCointergration(dataPath,1ID,2ID,roll)
File "C:/Users/Stacey/Documents/scripts/cointergrationTest.py", line 42, in getCointergration
X = dfMyRoll[1:,'SCORE']
File "C:\Anaconda\lib\site-packages\pandas\core\frame.py", line 2059, in __getitem__
return self._getitem_column(key)
File "C:\Anaconda\lib\site-packages\pandas\core\frame.py", line 2066, in _getitem_column
return self._get_item_cache(key)
File "C:\Anaconda\lib\site-packages\pandas\core\generic.py", line 1384, in _get_item_cache
res = cache.get(item)
TypeError: unhashable type: 'slice'
You need to use loc rather than iloc:
X = dfMyRoll.loc[1:,'SCORE']
Y = dfMyRoll.loc[1:,'SCORE_LAG']
iloc is read as "integer location" and accepts only integer positions, so the string label 'SCORE' is rejected. loc is label-based and happily takes both the row slice and the column label. (Older pandas also offered ix, which mixed the two styles, but it has since been deprecated and removed.)
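A toy illustration of the difference (my example, not from the original answer):

import pandas as pd

df = pd.DataFrame({'SCORE': [0.1, 0.2, 0.3]}, index=['a', 'b', 'c'])

print(df.iloc[1:, 0])         # positional: rows 1 onward, column 0
print(df.loc['b':, 'SCORE'])  # label-based: rows from 'b', column 'SCORE'
# df.iloc[1:, 'SCORE'] raises, because iloc only accepts integer positions.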

Python code for a plot. Receiving a message that I have one less Y value

from scitools.std import *

t = []
v = []
infile = open('running.txt', 'r')
for line in infile:
    tnext, vnext = line.strip().split(',')
    t.append(float(tnext))
    v.append(float(vnext))
infile.close()

a = []
for i in range(len(t)-1):
    a.append((v[i+1] - v[i])/(t[i+1] - t[i]))

s = []
for i in range(len(t)-1):
    s.append((v[i+1])*(t[i+1] - t[i]))

plot(t, a)
plot(t, s)
This is the outcome of the code (the error):
Traceback (most recent call last):
File "******1c.py", line 20, in <module>
plot(t, a)
File "/usr/lib/python2.6/site-packages/scitools/easyviz/common.py", line 3046, in plot
format=''))
File "/usr/lib/python2.6/site-packages/scitools/easyviz/common.py", line 372, in __init__
self.setp(**kwargs)
File "/usr/lib/python2.6/site-packages/scitools/easyviz/common.py", line 445, in setp
'not %d.' % (size(x),size(x),size(y))
AssertionError: Line.setp: x has size 1219, expected y to have size 1219, not 1218.
The problem is on the last line. I have 1219 x points and 1218 y points. How can I fix this?
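No answer was recorded, but the mismatch follows from how a and s are built: each entry describes the interval between t[i] and t[i+1], so both lists hold len(t) - 1 values. A minimal sketch of the usual fix, trimming the time axis to match:

# a[i] and s[i] belong to the interval [t[i], t[i+1]], so pair them with
# a time axis of the same length (len(t) - 1 points).
plot(t[:-1], a)   # or t[1:], depending on which endpoint you prefer
plot(t[:-1], s)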
