How to improve the efficiency of a while loop with pandas - Python

I am new to Python. In my job I deal with a lot of data, so I started studying Python to improve my efficiency.
My first small trial: finding the nearest distance between two sets of coordinates.
I have two files, one named "book.csv" and the other named "macro.csv".
book.csv has three columns: BookName, Longitude, Latitude; macro.csv has three columns: MacroName, Longitude, Latitude.
The purpose of the trial is to find the nearest Macro to each Book. I used pandas and I get the right result, but the efficiency is a little low: with 1500 books and 200 macros it takes about 15 seconds.
Please help me see whether I can improve the efficiency. Thanks. The following is my trial code:
# import pandas
from pandas import Series, DataFrame
import pandas as pd
# import geopy, to calculate the distance between two points
import geopy.distance

# calculate the distance in metres between two (Lat, Lon) points
def dist(coord1, coord2):
    # note: newer geopy versions replace vincenty() with geodesic()
    return geopy.distance.vincenty(coord1, coord2).m

# find the nearest result for each book: MacroName and distance
def find_nearest_macro(df_macro, df_book):
    # get column contents from the DataFrames as Series
    # Macro
    s_macro_name = df_macro["MacroName"]
    s_macro_Lat = df_macro["Latitude"]
    s_macro_Lon = df_macro["Longitude"]
    # Book
    s_book_name = df_book["BookName"]
    s_book_Lat = df_book["Latitude"]
    s_book_Lon = df_book["Longitude"]
    # empty lists, used to collect the nearest results
    nearest_macro = []
    nearest_dist = []
    # loop through each book
    ibook = 0
    while ibook < len(s_book_name):
        # coordinate of the current book
        book_coord = (s_book_Lat[ibook], s_book_Lon[ibook])
        # initialise the result with the first macro
        # (note: the initial distance must use the current book, not book 0)
        nearest_macro_name = s_macro_name[0]
        nearest_macro_dist = dist(book_coord, (s_macro_Lat[0], s_macro_Lon[0]))
        # loop through the remaining macros
        imacro = 1
        while imacro < len(s_macro_name):
            # coordinate of the current macro
            macro_coord = (s_macro_Lat[imacro], s_macro_Lon[imacro])
            # distance between the book and the macro
            tempd = dist(book_coord, macro_coord)
            # if this macro is closer, update the result
            if tempd < nearest_macro_dist:
                nearest_macro_dist = tempd
                nearest_macro_name = s_macro_name[imacro]
            # increment the loop variable
            imacro = imacro + 1
        # append the nearest result for this book
        nearest_macro.append(nearest_macro_name)
        nearest_dist.append(nearest_macro_dist)
        # increment the loop variable
        ibook = ibook + 1
    # return the nearest macro names and distances (a tuple can return 2 results)
    return (nearest_macro, nearest_dist)

# assign the filenames
file_macro = '.\\TestFile\\Macro.csv'
file_book = '.\\TestFile\\Book.csv'
# read the CSV contents into DataFrames
df_macro = pd.read_csv(file_macro)
df_book = pd.read_csv(file_book)
# find the nearest macro name and distance for each book
t_nearest_result = find_nearest_macro(df_macro, df_book)
# convert the result lists to Series
s_nearest_macro_name = Series(t_nearest_result[0])
s_nearest_macro_dist = Series(t_nearest_result[1])
# insert the new Series into the DataFrame
df_book["NearestMacro"] = s_nearest_macro_name
df_book["NearestDist"] = s_nearest_macro_dist
print(df_book.head())
# write the new df_book to a new csv file
df_book.to_csv('.\\TestFile\\nearest.csv')
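As for the efficiency itself: almost all of the 15 seconds goes into calling the geodesic routine 1500 × 200 times in a pure-Python double loop. A minimal vectorized sketch of the same search, assuming the column layout above and substituting a spherical haversine formula for Vincenty's method (slightly less accurate, but it lets NumPy compute the whole distance matrix in one broadcast):
import numpy as np
import pandas as pd

def haversine_m(lat1, lon1, lat2, lon2):
    # great-circle distance in metres on a spherical Earth (radius 6371 km)
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371000 * np.arcsin(np.sqrt(a))

df_macro = pd.read_csv('.\\TestFile\\Macro.csv')
df_book = pd.read_csv('.\\TestFile\\Book.csv')

# broadcast books (rows) against macros (columns): distance matrix of shape (n_books, n_macros)
d = haversine_m(df_book['Latitude'].values[:, None],
                df_book['Longitude'].values[:, None],
                df_macro['Latitude'].values[None, :],
                df_macro['Longitude'].values[None, :])

nearest = d.argmin(axis=1)  # column index of the nearest macro for each book
df_book['NearestMacro'] = df_macro['MacroName'].values[nearest]
df_book['NearestDist'] = d[np.arange(len(df_book)), nearest]
df_book.to_csv('.\\TestFile\\nearest.csv')
At this data size the matrix is only 1500 × 200 floats, so the broadcast is cheap; the trade-off is the spherical-Earth approximation, which can differ from the geodesic result by up to roughly 0.5%.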

Related

Why does mean squared displacement return NaN in MDAnalysis, Python?

I am pretty new to MDAnalysis and am trying to analyze data from LAMMPS simulations, trying to find the self-diffusivity and persistence length of a polymer. I built the universe from scratch in MDAnalysis because the LAMMPS data parser doesn't actually like reading LAMMPS data files. But the MSD returns a list of all NaNs, and I'm not sure if the problem is in that operation or in how I've set up the universe.
Here's the part of my code where I construct the universe and then try to find self-diffusivity:
import numpy as np
import MDAnalysis as mda
from MDAnalysis.analysis import msd
from scipy.stats import linregress

# now, I create the universe (7 atoms, with an in-memory trajectory)
u = mda.Universe.empty(7, trajectory=True)
u.add_TopologyAttr('name', ['agg1','agg2','agg3','agg4','agg5','end1','end2'])
u.add_TopologyAttr('type', ['1','1','1','1','1','2','2'])
# add bonds
bonds = [(0,1),(1,2),(2,3),(3,4),(5,0),(4,6)]
u.add_TopologyAttr('bonds', bonds)
# load coords from a numpy array built elsewhere, ordered (frame, atom, coordinate)
u.load_new(coords, order='fac')

timestep = 100000
MSD = msd.EinsteinMSD(u, select='all', msd_type='xyz', fft=True)
MSD.run()
msdResult = MSD.results.timeseries
print(msdResult)

# fit the linear regime of the MSD to extract the diffusion coefficient
lagtimes = np.arange(MSD.n_frames) * timestep
startTime = 0
startIndex = int(startTime / timestep)
endTime = 10000000
endIndex = int(endTime / timestep)
linear_model = linregress(lagtimes[startIndex:endIndex],
                          msdResult[startIndex:endIndex])
slope = linear_model.slope
error = linear_model.rvalue  # note: rvalue is the correlation coefficient, not an error estimate
D = slope * 1 / (2 * MSD.dim_fac)
print(D)
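One quick way to narrow this down: NaNs in the MSD usually trace back to NaNs in the coordinates themselves, so it is worth scanning the trajectory before running the analysis. A minimal check, assuming u is the Universe built above:
import numpy as np

# list every frame whose positions contain a NaN; an empty list means the
# trajectory itself is clean and the problem lies elsewhere
nan_frames = [ts.frame for ts in u.trajectory if np.isnan(ts.positions).any()]
print(nan_frames)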
EDIT: I ran it again and it worked. I didn't change a thing in my code, so I have no idea why. Oh well...

Concatenating tables with axis=1 in Orange, Python

I'm fairly new to Orange.
I'm trying to separate rows of angle (elv) into intervals.
Let's say I want to separate a 90-degree angle into 8 intervals, i.e. 90/8 = 11.25 degrees per interval.
Here's the table I'm working with.
Here's what I did originally, separating them by their elv value.
Here's the result that I want: x rows and 16 columns, separated by their elv value.
But I want it done dynamically.
I list them out and turn each list into a table with x rows and 2 columns.
This is what I originally did:
from Orange.data.table import Table
from Orange.data import Domain, ContinuousVariable, DiscreteVariable
import numpy
import pandas as pd
from pandas import DataFrame

df = pd.DataFrame()
num = 10                 # number of intervals that we want to separate our elv into
interval = 90.00 / num   # degrees per interval
low = 0
high = interval
table = []
first = []
second = []
for i in range(num):
    between = []
    if i != 0:  # not the first run: shift the window up one interval
        low = high
        high = high + interval
    # run through the whole table to see if the elv falls inside the interval
    for row in in_data:  # in_data is supplied by Orange's Python Script widget
        if row[0] >= low and row[0] < high:
            between.append(row)
    elv = "elv" + str(i)
    err = "err" + str(i)
    domain = Domain([ContinuousVariable.make(err)], [ContinuousVariable.make(elv)])
    data = Table.from_numpy(domain, numpy.array(between))
    print("table number ", i)
    print(data[:3])
Here's the output.
But as you can see, these are separate tables, one assigned on every pass through the loop,
and I have to find a way to concatenate them with axis=1.
Even the source code for Orange3 forbids this for some reason.
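Since the per-interval tables generally end up with different row counts, a column-wise concatenation is only well-defined after padding the shorter ones, which is presumably why Orange forbids it. A minimal NumPy sketch, assuming the between arrays from the loop above are collected into a list called buckets (a hypothetical name), that NaN-pads each bucket to the tallest one and stacks them side by side:
import numpy as np

def hstack_padded(buckets):
    # buckets: list of 2-D arrays, one per interval, each shaped (rows_i, 2)
    n = max(len(b) for b in buckets)          # row count of the tallest bucket
    padded = []
    for b in buckets:
        a = np.full((n, b.shape[1]), np.nan)  # NaN-pad the shorter buckets
        a[:len(b)] = b
        padded.append(a)
    return np.hstack(padded)                  # x rows, 2 * num columns
The padded array can then be wrapped in a single Table.from_numpy with a Domain holding all of the per-interval variables.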

How to compare these data sets from a CSV? Python 2.7

I have a project where I'm trying to create a program that takes a CSV data set from www.transtats.gov, a data set of airline flights in the US. My goal is to find the flight from one airport to another that had the worst delays overall, meaning it is the "worst flight". So far I have this:
import csv

with open('826766072_T_ONTIME.csv') as csv_infile:  # import and open CSV
    reader = csv.DictReader(csv_infile)
    total_delay = 0
    flight_count = 0
    flight_numbers = []
    delay_totals = []
    dest_list = []  # create empty list of destinations
    for row in reader:
        if row['ORIGIN'] == 'BOS':  # only take flights leaving BOS
            if row['FL_NUM'] not in flight_numbers:
                flight_numbers.append(row['FL_NUM'])
            if row['DEST'] not in dest_list:  # if the dest is not already in the list
                dest_list.append(row['DEST'])  # append the dest to dest_list
    for number in flight_numbers:
        for row in reader:
            if row['ORIGIN'] == 'BOS':  # for flights leaving BOS
                if row['FL_NUM'] == number:
                    if float(row['CANCELLED']) < 1:  # if the flight is not cancelled
                        if float(row['DEP_DELAY']) >= 0:  # and the delay is greater or equal to 0 (some flights had negative delay?)
                            total_delay += float(row['DEP_DELAY'])  # add time of delay to total delay
                            flight_count += 1  # add the flight to total flight count
    for row in reader:
        for number in flight_numbers:
            delay_totals.append(sum(row['DEP_DELAY']))
I was thinking that I could create a list of flight numbers and a list of the total delays for those flight numbers, then compare the two to see which flight had the highest delay total. What is the best way to go about comparing the two lists?
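Assuming the two lists end up aligned index-for-index, the comparison itself can be a single max over the zipped pairs; a minimal sketch:
# pair each flight number with its delay total and keep the pair with the largest total
worst_number, worst_total = max(zip(flight_numbers, delay_totals),
                                key=lambda pair: pair[1])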
I'm not sure if I understand you correctly, but I think you should use a dict for this purpose, where the key is the 'FL_NUM' and the value is the total delay.
In general I want to eliminate loops in Python code. For files that aren't massive I'll typically read through the data file once and build up some dicts that I can analyze at the end. The code below isn't tested because I don't have the original data, but it follows the general pattern I would use.
Since a flight is identified by its origin, destination, and flight number, I would capture them as a tuple and use that as the key in my dict.
from collections import defaultdict

flight_delays = defaultdict(list)  # look this up if you aren't familiar
for row in reader:
    if row['ORIGIN'] == 'BOS':  # only take flights leaving BOS
        if float(row['CANCELLED']) == 0:  # skip cancelled flights
            flight = (row['ORIGIN'], row['DEST'], row['FL_NUM'])
            flight_delays[flight].append(float(row['DEP_DELAY']))

# Finished reading through data, now I want to calculate average delays
worst_flight = ""
worst_delay = 0
for flight, delays in flight_delays.items():
    average_delay = sum(delays) / len(delays)
    if average_delay > worst_delay:
        worst_flight = flight[0] + " to " + flight[1] + " on FL#" + flight[2]
        worst_delay = average_delay
A very simple solution would be adding two new variables:
max_delay = 0
delay_flight = None
# Change: if float(row['DEP_DELAY']) >= 0: to:
if float(row['DEP_DELAY']) > max_delay:
    max_delay = float(row['DEP_DELAY'])
    delay_flight = row['FL_NUM']  # save the flight number for reference

Need help writing code that will automatically write more code?

I need help with writing code for a work project. I have written a script that uses pandas to read an Excel file, with a while loop that iterates through each row and appends the latitude/longitude data from the file onto a map (Folium, OpenStreetMap).
The issue I've run into has to do with the GPS data. I download a CSV file with vehicle coordinates. On some of the vehicles I'm tracking, the GPS loses signal for whatever reason and doesn't come back online for hundreds of miles. This causes issues when I'm using line plots to track the vehicle movement on the map: I end up getting long straight lines running across cities, since Folium tries to connect the last GPS coordinate before the vehicle went offline with the next coordinate available once the vehicle is back online, which could be hundreds of miles away. I think that every time the script finds a gap in the GPS coordinates, I could start a completely new line plot and append it to the existing map. That way I would still see the entire vehicle route on the map, but without the long lines connecting broken points.
My idea is to have my script calculate the absolute difference between successive longitude values. If the difference between two points is greater than 0.01, I want my program to end the current loop and start a new one. Each new loop would need newly initialized variables, and I will not know how many new loops are needed, since there's no way to predict how many times the GPS will go offline/online in each vehicle.
https://gist.github.com/tapanojum/81460dd89cb079296fee0c48a3d625a7
import folium
import pandas as pd

# Pulls CSV file from this location and adds headers to the columns
df = pd.read_csv('Example.CSV', names=['Longitude', 'Latitude'])
lat = (df.Latitude / 10 ** 7)  # Converting Lat/Lon into decimal degrees
lon = (df.Longitude / 10 ** 7)
zoom_start = 17  # Zoom level and starting location when map is opened
mapa = folium.Map(location=[lat[1], lon[1]], zoom_start=zoom_start)
i = 0
j = (lat[i] - lat[i - 1])
location = []
while i < len(lat):
    if abs(j) < 0.01:
        location.append((lat[i], lon[i]))
        i += 1
    else:
        break
# This section is where additional loops would ideally be generated

# Line plot settings
c1 = folium.MultiPolyLine(locations=[location], color='blue', weight=1.5, opacity=0.5)
c1.add_to(mapa)
mapa.save(outfile="Example.html")
Here's pseudocode for how I want to accomplish this:
1) Python reads the csv
2) Converts Long/Lat into decimal degrees
3) Init location1
4) Runs a while loop to append coords
5) If abs(j) >= 0.01, break the loop
6) Init location(2,3,...)
7) Generates a new while i < len(lat): loop using location(2,3,...)
8) Repeats steps 5-7 while i < len(lat) (repeat as many times as there are instances of abs(j) >= 0.01)
9) Creates (c1, c2, c3,...) = folium.MultiPolyLine(locations=[location], color='blue', weight=1.5, opacity=0.5) for each location variable
10) Creates c1.add_to(mapa) for each c1, c2, c3,... listed above
11) mapa.save
Any help would be tremendously appreciated!
UPDATE:
Working Solution
import folium
import pandas as pd

# Pulls CSV file from this location and adds headers to the columns
df = pd.read_csv('EXAMPLE.CSV', names=['Longitude', 'Latitude'])
lat = (df.Latitude / 10 ** 7)  # Converting Lat/Lon into decimal degrees
lon = (df.Longitude / 10 ** 7)
zoom_start = 17  # Zoom level and starting location when map is opened
mapa = folium.Map(location=[lat[1], lon[1]], zoom_start=zoom_start)
i = 1
location = []
while i < (len(lat) - 1):
    location.append((lat[i], lon[i]))
    i += 1
    j = (lat[i] - lat[i - 1])
    if abs(j) > 0.01:  # gap detected: draw the segment so far and start a new one
        c1 = folium.MultiPolyLine(locations=[location], color='blue', weight=1.5, opacity=0.5)
        c1.add_to(mapa)
        location = []
# draw the final segment left over after the loop
if location:
    folium.MultiPolyLine(locations=[location], color='blue', weight=1.5, opacity=0.5).add_to(mapa)
mapa.save(outfile="Example.html")
Your while loop looks wonky: you only set j once, outside the loop. Also, I think you want a list of line segments. Did you want something like this:
i = 0
segment = 0
locations = []
while i < len(lat):
    locations.append([])  # start a new segment
    # add points to the current segment until all are
    # consumed or a disconnect is detected
    while i < len(lat):
        locations[segment].append((lat[i], lon[i]))
        i += 1
        if i < len(lat) and abs(lat[i] - lat[i - 1]) > 0.01:
            break
    segment += 1
When this is done, locations will be a list of segments, e.g.:
[ segment0, segment1, ..... ]
and each segment will be a list of points, e.g.:
[ (lat,lon), (lat,lon), ..... ]
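With locations in that shape, every segment can then be drawn without bridging the gaps. A short sketch reusing the plot settings from the question (MultiPolyLine already accepts a list of lines):
# one call draws all segments; gaps between segments are not connected
c = folium.MultiPolyLine(locations=locations, color='blue', weight=1.5, opacity=0.5)
c.add_to(mapa)
mapa.save(outfile="Example.html")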

Resolving TypeError in Python code

I'm basically running some code as follows. I'm retrieving pairs of stocks (laid out as Row 1: Stock 1, Stock 2; Row 2: Stock 1, Stock 2; and so on, where Stocks 1 and 2 are different in each row) from a CSV file. I then take in data from Yahoo associated with these "pairs" of stocks, calculate the returns of the stocks, and check whether the distance (difference in returns) between a pair of stocks breaches some threshold, in which case I return 1. However, I'm getting the following error, which I am unable to resolve:
PricePort(tickers)
27 for ticker in tickers:
28 #print ticker
---> 29 x = pd.read_csv('http://chart.yahoo.com/table.csv?s=ttt'.replace('ttt',ticker),usecols=[0,6],index_col=0)
30 x.columns=[ticker]
31 final=pd.merge(final,x,left_index=True,right_index=True)
TypeError: expected a character buffer object
The code is as follows:
from datetime import datetime
import pytz
import csv
import pandas as pd
import pandas.io.data as web
import numpy as np

# Retrieves pairs of stocks (laid out as Row 1 - Stock 1,2; Row 2 - Stock 1,2 and so on,
# where Stock 1 and 2 are different in each row) from the CSV file
def Dataretriever():
    Pairs = []
    f1 = open('C:\Users\Pythoncode\Pairs.csv')  # Enter the location of the file
    csvdata = csv.reader(f1)
    for row in csvdata:  # reading tickers from the csv file
        Pairs.append(row)
    return Pairs

tickers = Dataretriever()  # Obtaining the data

# Taking in data from Yahoo associated with these "Pairs" of Stocks
def PricePort(tickers):
    """
    Returns historical adjusted prices of a portfolio of stocks.
    tickers=pairs
    """
    final = pd.read_csv('http://chart.yahoo.com/table.csv?s=^GSPC', usecols=[0,6], index_col=0)
    final.columns = ['^GSPC']
    for ticker in tickers:
        #print ticker
        x = pd.read_csv('http://chart.yahoo.com/table.csv?s=ttt'.replace('ttt', ticker), usecols=[0,6], index_col=0)
        x.columns = [ticker]
        final = pd.merge(final, x, left_index=True, right_index=True)
    return final

# Calculating returns of the stocks
def Returns(tickers):
    l = []
    begdate = (2014,1,1)
    enddate = (2014,6,1)
    p = PricePort(tickers)
    ret = (p.close[1:] - p.close[:-1]) / p.close[1:]
    l.append(ret)
    return l

# Basically a class to see if the distance (difference in returns) between a
# pair of stocks breaches some threshold
class ThresholdClass():
    # constructor
    def __init__(self, Pairs):
        self.Pairs = Pairs

    # Calculating the distance (difference in returns) between a pair of stocks
    def Distancefunc(self, tickers):
        k = 0
        l = Returns(tickers)
        summation = [[0 for x in range(k)] for x in range(k)]  # 2d matrix for the squared distance
        for i in range(k):
            for j in range(i+1, k):  # it will be an upper triangular matrix
                for p in range(len(self.PricePort(tickers)) - 1):
                    summation[i][j] = summation[i][j] + (l[i][p] - l[j][p])**2  # calculating distance
        for i in range(k):  # setting the lower half of the matrix to 1 (if we see 1 in the answer we will set a higher limit, but typically the squared distance is less than 1)
            for j in range(i+1):
                sum[i][j] = 1
        return sum

    # This function is used in determining the threshold distance
    def MeanofPairs(self, tickers):
        sum = self.Distancefunc(tickers)
        mean = np.mean(sum)
        return mean

    # This function is used in determining the threshold distance
    def StandardDeviation(self, tickers):
        sum = self.Distancefunc(tickers)
        standard_dev = np.std(sum)
        return standard_dev

    def ThresholdandnewsChecker(self, tickers):
        threshold = self.MeanofPairs(tickers) + 2*self.StandardDeviation(tickers)
        if (self.Distancefunc(tickers) > threshold):
            return 1

Threshold_Class = ThresholdClass(tickers)
Threshold_Class.ThresholdandnewsChecker(tickers, 1)
The trouble is that Dataretriever() returns a list of lists, not strings. When you iterate over tickers, the name ticker is bound to a list (one row of the CSV), not a string.
The str.replace method expects both arguments to be strings. The following code raises the error because the second argument is a list:
'http://chart.yahoo.com/table.csv?s=ttt'.replace('ttt', ticker)
The subsequent line x.columns = [ticker] will cause similar problems. Here, ticker needs to be a hashable object (like a string or an integer), but lists are not hashable.
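A minimal sketch of the fix, assuming each CSV row holds a pair of ticker symbols: loop over the symbols inside each row so that ticker is always a string:
for pair in tickers:
    for ticker in pair:  # pair is a list like ['AAPL', 'MSFT']; ticker is a string
        x = pd.read_csv('http://chart.yahoo.com/table.csv?s=ttt'.replace('ttt', ticker),
                        usecols=[0, 6], index_col=0)
        x.columns = [ticker]
        final = pd.merge(final, x, left_index=True, right_index=True)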
