I am trying to return the zipcode as column in a dataframe. This code works but doesn't create a new column in the dataframe gps.
import geocoder
import multiprocessing as mp
import pandas as pd
google_key = 'key'
def reverse_gecode(coordinates):
return geocoder.google(coordinates, key = google_key, method = 'reverse').postal
if __name__ == '__main__':
gps = pd.DataFrame({'lat': [27.950575, 40.6936488],
'lon': [-82.4571776, -89.5889864]}) # dataframe mehtod
gps['gps'] = zip(gps.lat, gps.lon)
x = list(gps['gps'])
# multiprocessings
pool = mp.Pool(processes = (mp.cpu_count() - 1))
result_latlong = pool.map(reverse_gecode, x)
pool.close()
pool.join()
I have tried
gps['zip_code'] = gps.apply(lambda x: pool.map(reverse_gecode,
list(x[2])),
axis = 1)
gps['zip_code'] = gps.apply(lambda x: pool.map(reverse_gecode, x[2]),
axis = 1)
gps['zip_code'] = gps.apply(lambda x: pool.map(reverse_gecode, [x[0], x[1]]),
axis = 1)
But I just cannot get anything to work. The error I keep getting is
ValueError: ('Unknown location: 27.950575', u'occurred at index 0')
try is:
import geocoder
import multiprocessing as mp
import pandas as pd
def reverse_gecode(coordinates):
return geocoder.google(coordinates, method = 'reverse').postal
if __name__ == '__main__':
gps = pd.DataFrame({'lat': [27.950575, 40.6936488],
'lon': [-82.4571776, -89.5889864]}) # dataframe mehtod
coords = gps[['lat','lon']].astype(str).apply(lambda x: (x[0],x[1]), axis=1).tolist()
# multiprocessings
pool = mp.Pool(processes = (mp.cpu_count() - 1))
gps['zip_code'] = pool.map(reverse_gecode, coords)
print(gps)
pool.close()
pool.join()
PS i've removed key=google_key in the geocoder.google() call, because it didn't work for me
Output:
lat lon zip_code
0 27.950575 -82.457178 33602
1 40.693649 -89.588986 61603
Related
I am trying to use multiprocessing to speed up my data processing. I am working on a machine with 6 Cores, so I want to iterate through a table of 12 million rows, and for each of these rows I iterate through several time steps doing a calculation (executing a function).
This line I would like to split up that it runs in parallel on different cores:
test = [rowiteration(i, output, ini_cols, cols) for i in a] # this should run in parallel
I tried something with
from multiprocessing import Pool
but I did not manage to pass the arguments of the function and the iterator.
I would appreciate any idea. I am new to Python.
This is what i have:
import pyreadr
import pandas as pd
import numpy as np
import time
from datetime import timedelta
import functools
from pathlib import Path
def read_data():
current_path = os.getcwd()
myfile = os.path.join(str(Path(current_path).parents[0]), 'dummy.RData')
result = pyreadr.read_r(myfile)
pc = result["pc"]
u = result["u"]
return pc, u
# add one column per time
def prepare_output_structure(pc):
ini_cols = pc.columns
pc = pc.reindex(columns=[*pc.columns, *np.arange(0, 11), 'cat'], fill_value=0)
pc.reset_index(level=0, inplace=True)
# print(pc.columns, pc.shape, pc.dtypes)
return pc, ini_cols
def conjunction(*conditions):
return functools.reduce(np.logical_and, conditions)
def timeloop(t_final: int, count_final: int, tipo):
if tipo == 'A':
count_ini = 35
else: # B:
count_ini = 30
yy_list = []
for t in np.arange(0, 11):
yy = ((count_final - count_ini) / t_final) * t + count_ini
yy_list.append(int(yy))
return yy_list
def rowiteration(i, output, ini_cols, cols):
c_2: bool = pc.loc[i, 'tipo'] == u.iloc[:, 0].str[:1] # first character of category e.g. 'A1'
c_5: bool = pc.loc[i, 't_final'] >= u.iloc[:, 1] # t_min (u)
c_6: bool = pc.loc[i, 't_final'] <= (u.iloc[:, 2]) # t_max (u)
pc.loc[i, 'cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]
pc.iloc[i, (0 + (len(ini_cols))+1):(10 + (len(ini_cols))+2)] = timeloop(int(pc.loc[i, 't_final']), int(pc.loc[i, 'count_final']), pc.loc[i, 'tipo'])
out = pd.DataFrame(pc.iloc[i, :])
out = pd.DataFrame(out.transpose(), columns=cols)
output = output.append(out.iloc[0, :])
return output
if __name__ == '__main__':
start_time = time.time()
pc, u = read_data()
nrowpc = len(pc.index)
a = np.arange(0, nrowpc) # filas tabla pc
# print(a, nrowpc, len(pc.index))
pc, ini_cols = prepare_output_structure(pc)
cols = pc.columns
output = pd.DataFrame()
test = [rowiteration(i, output, ini_cols, cols) for i in a] # this should run in parallel
pc2 = pd.concat(test, ignore_index=True)
pc2 = pc2.iloc[:, np.r_[5, (len(ini_cols)+1):(len(pc2.columns))]]
print(pc2.head)
elapsed_time_secs = time.time() - start_time
msg = "Execution took: %s secs (Wall clock time)" % timedelta(milliseconds=elapsed_time_secs)
print(msg)```
Replace your [rowiteration(i, output, ini_cols, cols) for i in a] with:
from multiprocessing import Pool
n_cpu = 10 # put in the number of threads of cpu
with Pool(processes=n_cpu) as pool:
ret = pool.starmap(rowiteration,
[(i, output, ini_cols, cols) for i in a])
Here is an approach that I think solves the problem and that only sends what is necessary to the worker processes. I haven't tested this as is (which would be difficult without the data your code reads in) but this is basic idea:
import multiprocessing as mp
p = mp.Pool(processes=mp.cpu_count())
# Note that you already define the static cols and ini_cols
# in global scope so you don't need to pass them to the Pool.
# ... Other functions you've defined ...
def rowiteration(row):
c_2: bool = row['tipo'] == u.iloc[:, 0].str[:1]
c_5: bool = row['t_final'] >= u.iloc[:, 1]
c_6: bool = row['t_final'] <= (u.iloc[:, 2])
row['cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]
row[(0 + (len(ini_cols))+1):(10 + (len(ini_cols))+2)] = timeloop(int(row['t_final']), int(row['count_final']), row['tipo'])
return row
out = []
for row in p.imap_unordered(rowiteration, [r for _, r in pc.iterrows()]):
row.index = cols
out.append(cols)
pc2 = pd.DataFrame(out, ignore_index=True)
Cost function implemented with Python:
**Thanks for help to achieve this.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
load_data = pd.read_csv('C:\python_program\ex1data1.txt',sep = ",",header = None)
feature_vale = load_data[0]
y = np.matrix(load_data[1])
m = len(feature_vale)
plt.scatter(load_data[0],load_data[1],marker='+',c = 'r')
plt.title("Cost_Function")
plt.xlabel("Population of City in 10,000s")
plt.ylabel("Profit in $10,000s")
df = pd.DataFrame(pd.Series(1,index= range(0,m)))
df[1] = load_data[0]
X = np.matrix(df)
row_theta = np.zeros(2,dtype = int)
theta = np.array([row_theta]) # Transpose the array
prediction = np.dot(X,theta.T)
error = (prediction-y.T)
error_df = pd.DataFrame(error)
#square the error
squared_error = np.square(error_df)
sum = np.sum(squared_error)
print(sum)
J = np.sum(squared_error) / (2 * m)
print(J)
Data reference link: searchcode.com/codesearch/view/5404318
repeat the following steps and let me know
load_data = pd.read_csv('data.txt',sep = ",",header = None)
feature_vale = load_data[0]
y = np.matrix(load_data[1])
m = len(feature_vale)
#print(m)
#plt.scatter(load_data[0],load_data[1])
df = pd.DataFrame(pd.Series(1,index= range(0,m)))
df[1] = load_data[0]
X = np.matrix(df)
row_theta = np.zeros(2,dtype = int)
theta = np.array([row_theta]) # Transpose the array
print(theta.T)
prediction = np.matmul(X,theta.T)
error = (prediction-y)
error_df = pd.DataFrame(error)
squared_error = np.square(error_df)
print(squared_error)
I have an interesting problem. I have two files, NYPD_Motor_Collisions.csv has 1.2M lines and weatherfinal.txt has 109K lines. The objective is to merge the temp and prec data from weatherfinal.txt to the Collisions files as two columns based on the latitudes and longitudes. I wrote the following code using dataframe in pandas python.
from math import cos, asin, sqrt
import pandas as pd
import numpy as np
import os
import re
import datetime
def distance(lat1, lon1, lat2, lon2):
p = 0.017453292519943295
a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p)*cos(lat2*p) * (1-cos((lon2-lon1)*p)) / 2
return 12742 * asin(sqrt(a))
def closest(data, v):
return min(data, key=lambda p: distance(v['lat'],v['lon'],p['lat'],p['lon']))
tempDataList = []
#v = {'lat': 39.7622290, 'lon': -86.1519750}
#print(closest(tempDataList, v))
print os.getcwd()
filed_ = open("weatherfinal.txt", 'r')
fileo_ = open("weatherfinal_updated.txt","w")
lines_ = filed_.readlines()
for line_ in lines_:
outline = re.sub(" +"," ",line_)
fileo_.write(outline + "\n")
fileo_.close()
df = pd.read_csv("NYPD_Motor_Vehicle_Collisions.csv")
colhead = np.append(df.columns.values,['TEMP', 'PREP'])
outdf = pd.DataFrame(columns=colhead)
df2 = pd.read_csv("weatherfinal_updated.txt",' ')
df2.set_index(['WBANNO', 'LST_DATE', 'LST_TIME'])
sensorIds = df2['WBANNO'].unique()
for ids_ in sensorIds:
longitude = df2.loc[df2['WBANNO']==ids_,'LONGITUDE'].iloc[0]
latitude = df2.loc[df2['WBANNO'] == ids_, 'LATITUDE'].iloc[0]
tempDataList.append({'lat':latitude,'lon':longitude,'SENSORID': ids_ })
print tempDataList
for index, row in df.iterrows():
lon_ = row['LONGITUDE']
lat_ = row['LATITUDE']
tdate = row['DATE']
ttime = row['TIME']
tcal = 5
pcal = 0
fwdate = datetime.datetime.strptime(str(tdate), '%m/%d/%Y').strftime('%Y%m%d')
fwtime = datetime.datetime.strptime(str(ttime), '%H:%M').strftime('%H%M')
ntime = float(fwtime) + float(100)
closests_ = closest(tempDataList, {'lat':lat_,'lon':lon_})
sensorid = closests_['SENSORID']
usedSensorId = sensorid
selectedWeatherRow = df2.loc[(df2.WBANNO == sensorid) & (df2.LST_DATE == float(fwdate)) & (df2.LST_TIME >= float(fwtime)) & (df2.LST_TIME < ntime) ,['T_CALC', 'P_CALC']]
if len(selectedWeatherRow.index) == 0:
for sensId in sensorIds:
if sensId == sensorid:
continue
selectedWeatherRow = df2.loc[(df2.WBANNO == sensId) & (df2.LST_DATE == float(fwdate)) & (df2.LST_TIME >= float(fwtime)) & (df2.LST_TIME < ntime), ['T_CALC', 'P_CALC']]
if len(selectedWeatherRow.index) == 0:
continue
else:
tcal = selectedWeatherRow['T_CALC'].values[0]
pcal = selectedWeatherRow['P_CALC'].values[0]
usedSensorId = sensId
break
else:
tcal = selectedWeatherRow['T_CALC'].values[0]
pcal = selectedWeatherRow['P_CALC'].values[0]
row['TEMP'] = tcal
row['PREP'] = pcal
outdf.loc[index] = row
print index, tcal, pcal, fwdate, fwtime, ntime, usedSensorId
print "Loop completed"
outdf.to_csv("NYPD_TRAFFIC_DATA.csv")
print "file completed"
This program has been running for days. Not sure why dataframe is too slow. I rewrote the program without dataframe using dictionaries and it completed in a few minutes. Not sure if dataframe is slow or I am not using it correctly. Just posting here for learning.
I am attempting to speed up calculations on a pandas DataFrame using multiprocessing which goes really well minus the fact that assigning the result of the calculation to the df.ix does not work here like it does in my code without trying multiprocessing here
I've added a #sanity check to the code which outputs valid values and would make me think this would work just fine, but the DataFrame doesn't get populated (stays as NaN). Does anyone know why that may be, and more importantly, what changes may be needed to plug the values into the DataFrame in the context of multiprocessing?
Output of sanity check:
should be setting df.ix[4][1] to: 23.2506112824
should be setting df.ix[0][0] to: 0.0
should be setting df.ix[7][0] to: 15.9574526264
code:
import mysql.connector
import numpy as np
from colormath.color_objects import LabColor
from colormath.color_diff import delta_e_cie2000
import pandas as pd
from mysql.connector.pooling import MySQLConnectionPool
from multiprocessing import Pool
pool = Pool()
cnx = mysql.connector.connect(user='user', password='pass',host='localhost', database='database')
cursor = cnx.cursor()
selectstmt = 'SELECT CIE_Lab, ID FROM `database`.`table`'
cursor.execute(selectstmt)
color = cursor.fetchall()
df = pd.DataFrame(columns = color, index = color)
sides = df.index
headers = df.dtypes.index
shape = df.shape[0]
def delta(cie_Lab1, cie_Lab2):
cie_Lab1 = cie_Lab1[1:]
cie_Lab1 = cie_Lab1[:-1]
cie_Lab2 = cie_Lab2[1:]
cie_Lab2 = cie_Lab2[:-1]
CIE_list1 = cie_Lab1.split(",")
CIE_list2 = cie_Lab2.split(",")
#print CIE_list1
CIE_L1 = CIE_list1[0]
CIE_a1 = CIE_list1[1]
CIE_b1 = CIE_list1[2]
CIE_L2 = CIE_list2[0]
CIE_a2 = CIE_list2[1]
CIE_b2 = CIE_list2[2]
color1 = LabColor(lab_l=CIE_L1, lab_a=CIE_a1, lab_b=CIE_b1)
color2 = LabColor(lab_l=CIE_L2, lab_a=CIE_a2, lab_b=CIE_b2)
deltae = delta_e_cie2000(color1, color2, Kl=1, Kc=1, Kh=1)
return deltae
def deltas(nums):
listoflists = []
for num in range(nums):
for mun in range(nums):
listoflists.append([num,mun])
return listoflists
def update(inp):
sides = df.index
headers = df.dtypes.index
num = inp[0]
mun = inp[1]
res = delta(headers[num][0], sides[mun][0])
#sanity check
print "should be setting df.ix["+str(mun)+"]["+str(num)+"] to: "+str(res)
df.ix[mun][num] = res
if __name__ == '__main__':
pool = Pool(4)
pool.map(update, deltas(shape))
pool.close()
pool.join()
print df
Dataframe example:
([69.62248143012944, -54.15108764844451, 67.92070706614288], 1) \
([69.62248143012944, -54.15108764844451, 67.920... NaN
([58.17848217611454, -52.251714243997995, 56.77... NaN
([87.02539335188214, -32.15758725885986, 66.450... NaN
([86.86259502866965, -31.483524711078015, 75.14... NaN
([85.39154525710671, -31.683349117376856, 71.35... NaN
I am trying to calculate the duration of the drawdowns and the time to recovery for a stock series. I can calculate the drawdowns but am struggling to the the durations and recovery time for each drawdown. So far I have this code:
import pandas as pd
import pickle
import xlrd
import numpy as np
np.random.seed(0)
df = pd.Series(np.random.randn(2500)*0.7+0.05, index=pd.date_range('1/1/2000', periods=2500, freq='D'))
df= 100*(1+df/100).cumprod()
df=pd.DataFrame(df)
df.columns = ['close']
df['ret'] = df.close/df.close[0]
df['modMax'] = df.ret.cummax()
df['modDD'] = 1-df.ret.div(df['modMax'])
groups = df.groupby(df['modMax'])
dd = groups['modMax','modDD'].apply(lambda g: g[g['modDD'] == g['modDD'].max()])
top10dd = dd.sort_values('modDD', ascending=False).head(10)
top10dd
This gives the 10 highest drawdowns of the series but I also want the duration of the drawdown and time to recovery.
I solved the problem as follows:
def drawdown_group(df,index_list):
group_max,dd_date = index_list
ddGroup = df[df['modMax'] == group_max]
group_length = len(ddGroup)
group_dd = ddGroup['dd'].max()
group_dd_length = len(ddGroup[ddGroup.index <= dd_date])
group_start = ddGroup[0:1].index[0]
group_end = ddGroup.tail(1).index[0]
group_rec = group_length - group_dd_length
#print (group_start,group_end,group_dd,dd_date,group_dd_length,group_rec,group_length)
return group_start,group_end,group_max,group_dd,dd_date,group_dd_length,group_rec,group_length
dd_col = ('start','end','peak', 'dd','dd_date','dd_length','dd_rec','tot_length')
df_dd = pd.DataFrame(columns = dd_col)
for i in range(1,10):
index_list = top10dd[i-1:i].index.tolist()[0]
#print(index_list)
start,end,peak,dd,dd_date,dd_length,dd_rec,tot_length = drawdown_group(df,index_list)
#print(start,end,dd,dd_date,dd_length,dd_rec,tot_length)
df_dd.loc[i-1] = drawdown_group(df,index_list)
Produces this table: