Remove outlier from multiple lists in python - python

I'm very new to Python and am working on plotting a graph with matplotlib with values from a csv and am trying to figure out the most efficient way to remove outliers from my lists. The CSV has three variables, x, y, z, which I've put into separate lists.
I want to find the standard deviation of each list and remove each point that is < or > 2x stdev (remove the point from each list - x, y, z, not just one list).
I'm having a hard time figuring out how to efficiently remove a point that is represented in three separate lists while making sure that I don't mix up different data points.
Do I use while loop and delete the value at a certain position for each variable? If so, how would I reference the position in the list where then number is larger than 2x stdev? Thanks!
import matplotlib.pyplot as plt
import csv
import statistics as stat
#making list of each variable
x = []
y = []
z = []
with open('fundata.csv', 'r') as csvfile:
plots = csv.reader(csvfile, delimiter = ',')
#skip the header line in CSV
next(plots)
#import each variable from the CSV file into a list as a float
for row in plots:
x.append(float(row[0]))
y.append(float(row[1]))
z.append(float(row[2]))
#cleaning up the data
stdev_x = stat.stdev(x)
stdev_y = stat.stdev(y)
stdev_z = stat.stdev(z)
print(stdev_x)
print(stdev_y)
print(stdev_z)
#making the graph
fig, ax = plt.subplots()
#make a scatter plot graphing x by y with z variable as color, size of each point is 3
ax.scatter(x, y, c=z, s=3)
#Set chart title and label the axes
ax.set_title("Heatmap of variables", fontsize = 18)
ax.set_xlabel("Var 1", fontsize = 14)
ax.set_ylabel("Var 2", fontsize = 14)
#open Matplotlib viewer
plt.show()
Data set is as follows but is ~35000 rows long with more variability:
var1
var2
var3
3876514
3875931
3875846
3876515
3875931
3875846
3876516
3875931
3875846

It is nearly always easier to use pandas to deal with data of this kind. Calculate the row-wise means and standard deviations, then select values within the required range. The outliers will be replaced with missing values. You can then use dropna to drop all the rows that contain missing values.
import pandas as pd
df = pd.read_csv("fundata.csv", names=["x", "y", "z"])
mean = df.mean(axis=0)
std = df.std(axis=0)
edited = df[(mean - 2 * std <= df) & (df <= mean + 2 * std)].dropna()
Alternatively, use scipy.stats.zscore, which will do the calculation for you:
from scipy.stats import zscore
...
edited = df[(abs(zscore(df)) <= 2).all(axis=1)]
If you want to avoid pandas for some reason, then one way would be to replace all the outliers within each column with None:
def replace_outliers(values):
mean = statistics.mean(values)
stdev = statistics.stdev(values)
for v in values:
if mean - 2 * stdev <= v <= mean + 2 * stdev:
yield v
else:
yield None
x, y, z = [replace_outliers(column) for column in [x, y, z]]
Then zip the columns together and select rows that do not contain None:
selected_rows = [row for row in zip(x, y, z) if not None in row]
Finally if needed you can zip the rows together to transpose the data back into three column lists:
x, y, z = zip(*selected_rows)

Related

Draw longest possible vertical line between two curves in seaborn

I currently have a plot like this (consider that data is the dataframe I pasted at the very bottom):
import seaborn as sns
sns.relplot(
data = data,
x = "Threshold",
y = "Value",
kind = "line",
hue="Metric"
).set(xlabel="Threshold")
Which produces:
Now, I want to know how can I annotate a line in this plot, such that it is located between the curves, at the x-Axis value where the distance between curves are maximized. I would also need to annotate text to show the distance value.
It should be something like this:
Here is the pandas dataframe:
Threshold,Metric,Value
0.0,Recall,1.0
0.010101010101010102,Recall,0.9802536231884058
0.020202020202020204,Recall,0.9706521739130435
0.030303030303030304,Recall,0.9621376811594203
0.04040404040404041,Recall,0.9541666666666667
0.05050505050505051,Recall,0.9456521739130435
0.06060606060606061,Recall,0.9322463768115942
0.07070707070707072,Recall,0.9173913043478261
0.08080808080808081,Recall,0.908695652173913
0.09090909090909091,Recall,0.8976449275362319
0.10101010101010102,Recall,0.8813405797101449
0.11111111111111112,Recall,0.8644927536231884
0.12121212121212122,Recall,0.8498188405797101
0.13131313131313133,Recall,0.8358695652173913
0.14141414141414144,Recall,0.818659420289855
0.15151515151515152,Recall,0.7967391304347826
0.16161616161616163,Recall,0.7748188405797102
0.17171717171717174,Recall,0.7521739130434782
0.18181818181818182,Recall,0.7269927536231884
0.19191919191919193,Recall,0.6952898550724638
0.20202020202020204,Recall,0.6704710144927536
0.21212121212121213,Recall,0.648731884057971
0.22222222222222224,Recall,0.6097826086956522
0.23232323232323235,Recall,0.5847826086956521
0.24242424242424243,Recall,0.5521739130434783
0.25252525252525254,Recall,0.5023550724637681
0.26262626262626265,Recall,0.4766304347826087
0.27272727272727276,Recall,0.42047101449275365
0.2828282828282829,Recall,0.3958333333333333
0.29292929292929293,Recall,0.3539855072463768
0.30303030303030304,Recall,0.3327898550724638
0.31313131313131315,Recall,0.3036231884057971
0.32323232323232326,Recall,0.2798913043478261
0.33333333333333337,Recall,0.2371376811594203
0.3434343434343435,Recall,0.22119565217391304
0.3535353535353536,Recall,0.17300724637681159
0.36363636363636365,Recall,0.15996376811594204
0.37373737373737376,Recall,0.13568840579710145
0.38383838383838387,Recall,0.11938405797101449
0.393939393939394,Recall,0.10652173913043478
0.4040404040404041,Recall,0.09891304347826087
0.4141414141414142,Recall,0.08894927536231884
0.42424242424242425,Recall,0.07681159420289856
0.43434343434343436,Recall,0.06557971014492754
0.4444444444444445,Recall,0.05253623188405797
0.4545454545454546,Recall,0.04655797101449275
0.4646464646464647,Recall,0.024456521739130436
0.4747474747474748,Recall,0.019384057971014494
0.48484848484848486,Recall,0.009782608695652175
0.494949494949495,Recall,0.0034420289855072463
0.5050505050505051,Recall,0.002173913043478261
0.5151515151515152,Recall,0.0016304347826086956
0.5252525252525253,Recall,0.0007246376811594203
0.5353535353535354,Recall,0.00018115942028985507
0.5454545454545455,Recall,0.0
0.5555555555555556,Recall,0.0
0.5656565656565657,Recall,0.0
0.5757575757575758,Recall,0.0
0.5858585858585859,Recall,0.0
0.595959595959596,Recall,0.0
0.6060606060606061,Recall,0.0
0.6161616161616162,Recall,0.0
0.6262626262626263,Recall,0.0
0.6363636363636365,Recall,0.0
0.6464646464646465,Recall,0.0
0.6565656565656566,Recall,0.0
0.6666666666666667,Recall,0.0
0.6767676767676768,Recall,0.0
0.686868686868687,Recall,0.0
0.696969696969697,Recall,0.0
0.7070707070707072,Recall,0.0
0.7171717171717172,Recall,0.0
0.7272727272727273,Recall,0.0
0.7373737373737375,Recall,0.0
0.7474747474747475,Recall,0.0
0.7575757575757577,Recall,0.0
0.7676767676767677,Recall,0.0
0.7777777777777778,Recall,0.0
0.787878787878788,Recall,0.0
0.797979797979798,Recall,0.0
0.8080808080808082,Recall,0.0
0.8181818181818182,Recall,0.0
0.8282828282828284,Recall,0.0
0.8383838383838385,Recall,0.0
0.8484848484848485,Recall,0.0
0.8585858585858587,Recall,0.0
0.8686868686868687,Recall,0.0
0.8787878787878789,Recall,0.0
0.888888888888889,Recall,0.0
0.8989898989898991,Recall,0.0
0.9090909090909092,Recall,0.0
0.9191919191919192,Recall,0.0
0.9292929292929294,Recall,0.0
0.9393939393939394,Recall,0.0
0.9494949494949496,Recall,0.0
0.9595959595959597,Recall,0.0
0.9696969696969697,Recall,0.0
0.9797979797979799,Recall,0.0
0.98989898989899,Recall,0.0
1.0,Recall,0.0
0.0,Fall-out,1.0
0.010101010101010102,Fall-out,0.6990465720990212
0.020202020202020204,Fall-out,0.58461408367334
0.030303030303030304,Fall-out,0.516647992727734
0.04040404040404041,Fall-out,0.4643680104855929
0.05050505050505051,Fall-out,0.4172674037587468
0.06060606060606061,Fall-out,0.3796376551170116
0.07070707070707072,Fall-out,0.3507811343889394
0.08080808080808081,Fall-out,0.33186055852694335
0.09090909090909091,Fall-out,0.3152231359533222
0.10101010101010102,Fall-out,0.29964272879098575
0.11111111111111112,Fall-out,0.2855844238208993
0.12121212121212122,Fall-out,0.27161068008371564
0.13131313131313133,Fall-out,0.25719298987379235
0.14141414141414144,Fall-out,0.24338836860241422
0.15151515151515152,Fall-out,0.2312538316808659
0.16161616161616163,Fall-out,0.22026087140350506
0.17171717171717174,Fall-out,0.2083377375642137
0.18181818181818182,Fall-out,0.19694311143056467
0.19191919191919193,Fall-out,0.18402638310466565
0.20202020202020204,Fall-out,0.17440754286197493
0.21212121212121213,Fall-out,0.16548633279073208
0.22222222222222224,Fall-out,0.15278100754709004
0.23232323232323235,Fall-out,0.14292962391391667
0.24242424242424243,Fall-out,0.1317252605542989
0.25252525252525254,Fall-out,0.11555292476164303
0.26262626262626265,Fall-out,0.10612434729298353
0.27272727272727276,Fall-out,0.08902183793839714
0.2828282828282829,Fall-out,0.08331395471745978
0.29292929292929293,Fall-out,0.07232099444009894
0.30303030303030304,Fall-out,0.06735302200706086
0.31313131313131315,Fall-out,0.061454876012092256
0.32323232323232326,Fall-out,0.05665602604485973
0.33333333333333337,Fall-out,0.048982094158932836
0.3434343434343435,Fall-out,0.045641925459273196
0.3535353535353536,Fall-out,0.03748176648415534
0.36363636363636365,Fall-out,0.0341415977844957
0.37373737373737376,Fall-out,0.029321607509037482
0.38383838383838387,Fall-out,0.026996173604211148
0.393939393939394,Fall-out,0.024353635075999407
0.4040404040404041,Fall-out,0.022514428260364035
0.4141414141414142,Fall-out,0.01940680295118703
0.42424242424242425,Fall-out,0.017165930279263473
0.43434343434343436,Fall-out,0.014459970826374648
0.4444444444444445,Fall-out,0.011035240893812233
0.4545454545454546,Fall-out,0.009386296852208105
0.4646464646464647,Fall-out,0.004756569350781135
0.4747474747474748,Fall-out,0.003868676405301989
0.48484848484848486,Fall-out,0.002135171130795087
0.494949494949495,Fall-out,0.0008033317125763693
0.5050505050505051,Fall-out,0.0004228061645138786
0.5151515151515152,Fall-out,0.00031710462338540896
0.5252525252525253,Fall-out,4.228061645138786e-05
0.5353535353535354,Fall-out,0.0
0.5454545454545455,Fall-out,0.0
0.5555555555555556,Fall-out,0.0
0.5656565656565657,Fall-out,0.0
0.5757575757575758,Fall-out,0.0
0.5858585858585859,Fall-out,0.0
0.595959595959596,Fall-out,0.0
0.6060606060606061,Fall-out,0.0
0.6161616161616162,Fall-out,0.0
0.6262626262626263,Fall-out,0.0
0.6363636363636365,Fall-out,0.0
0.6464646464646465,Fall-out,0.0
0.6565656565656566,Fall-out,0.0
0.6666666666666667,Fall-out,0.0
0.6767676767676768,Fall-out,0.0
0.686868686868687,Fall-out,0.0
0.696969696969697,Fall-out,0.0
0.7070707070707072,Fall-out,0.0
0.7171717171717172,Fall-out,0.0
0.7272727272727273,Fall-out,0.0
0.7373737373737375,Fall-out,0.0
0.7474747474747475,Fall-out,0.0
0.7575757575757577,Fall-out,0.0
0.7676767676767677,Fall-out,0.0
0.7777777777777778,Fall-out,0.0
0.787878787878788,Fall-out,0.0
0.797979797979798,Fall-out,0.0
0.8080808080808082,Fall-out,0.0
0.8181818181818182,Fall-out,0.0
0.8282828282828284,Fall-out,0.0
0.8383838383838385,Fall-out,0.0
0.8484848484848485,Fall-out,0.0
0.8585858585858587,Fall-out,0.0
0.8686868686868687,Fall-out,0.0
0.8787878787878789,Fall-out,0.0
0.888888888888889,Fall-out,0.0
0.8989898989898991,Fall-out,0.0
0.9090909090909092,Fall-out,0.0
0.9191919191919192,Fall-out,0.0
0.9292929292929294,Fall-out,0.0
0.9393939393939394,Fall-out,0.0
0.9494949494949496,Fall-out,0.0
0.9595959595959597,Fall-out,0.0
0.9696969696969697,Fall-out,0.0
0.9797979797979799,Fall-out,0.0
0.98989898989899,Fall-out,0.0
1.0,Fall-out,0.0
Use pivot to transform the data from long to wide
Use idxmax to find the x (Threshold) of the max difference between y1 and y2 (Fall-out and Recall)
Use vlines to plot the vertical line at x from y1 to y2
Use annotate to plot the label at the midpoint of y1 and y2
g = sns.relplot(data=data, x='Threshold', y='Value', hue='Metric', kind='line')
# pivot to wide form
p = data.pivot(index='Threshold', columns='Metric', values='Value')
# find x, y1, and y2 corresponding to max difference
diff = p['Fall-out'].sub(p['Recall']).abs()
x = diff.idxmax()
y1, y2 = p.loc[x]
# plot line and label
ax = g.axes.flat[0]
ax.vlines(x, y1, y2, ls='--')
ax.annotate(f'Dist = {diff.loc[x]:.2f}', ha='left', va='center',
xy=(x, 0.5*(y1+y2)), xycoords='data',
xytext=(5, 0), textcoords='offset pixels')
The easiest way which I can think of is to create two separate lists of all values where the metric is Recall and another with all values where metric is Fall-out. This can be easily done using pandas operations as follows (Assuming the dataframe has name df) -
import math
import matplotlib.pyplot as plt
ls_metric = df['Metric'].to_list()
ls_value = df['Value'].to_list()
ls_threshold = df['Threshold'].to_list()
ls_value_recall = []
ls_value_fallout = []
ls_threshold_recall = []
ls_threshold_fallout = []
for i, j, k in zip(ls_metric, ls_value, ls_threshold):
if (i == 'Recall'):
ls_value_recall.append(j)
ls_threshold_recall.append(k)
elif(i == 'Fall-out'):
ls_value_fallout.append(j)
ls_threshold_recall.append(k)
ls_dist = []
for i, j in zip(ls_value_recall, ls_value_fallout):
ls_dist.append(math.abs(i-j))
max_diff = max(ls_dist)
location_of_max_diff = ls_dist.index(max_diff)
value_of_threshold_at_max_diff = ls_threshold_recall[location_of_max_diff]
value_of_recall_at_max_diff = ls_value_recall[location_of_max_diff]
value_of_fallout_at_max_diff = ls_value_fallout[location_of_max_diff]
x_values = [value_of_threshold_at_max_diff, value_of_threshold_at_max_diff]
y_values = [value_of_recall_at_max_diff, value_of_fallout_at_max_diff]
plt.plot(x_values, y_values)
Certain Assumptions - The Threshold Values are the same and same number of readings are present for both metrics which I think is true having had a brief glance at the data but if not I believe it's still pretty easy to modify the code
You can add this plot to your own figure for which the syntax is readily available, now as far as the label for the line is concerned one way to do this is use matplotlib.pyplot.text to add a textbox but with that you'll need to tweak with the location to get the desired location another way to do this would be to add it as a legend only

Number format python

I want to have the legend of the plot shown with the value in a list. But what I get is the element index but not the value itself. I dont know how to fix it. I'm referring to the plt.plot line. Thanks for the help.
import matplotlib.pyplot as plt
import numpy as np
x = np.random.random(1000)
y = np.random.random(1000)
n = len(x)
d_ij = []
for i in range(n):
for j in range(i+1,n):
a = np.sqrt((x[i]-x[j])**2+(y[i]-y[j])**2)
d_ij.append(a)
epsilon = np.linspace(0.01,1,num=10)
sigma = np.linspace(0.01,1,num=10)
def lj_pot(epsi,sig,d):
result = []
for i in range(len(d)):
a = 4*epsi*((sig/d[i])**12-(sig/d[i])**6)
result.append(a)
return result
for i in range(len(epsilon)):
for j in range(len(sigma)):
a = epsilon[i]
b = sigma[j]
plt.cla()
plt.ylim([-1.5, 1.5])
plt.xlim([0, 2])
plt.plot(sorted(d_ij),lj_pot(epsilon[i],sigma[j],sorted(d_ij)),label = 'epsilon = %d, sigma =%d' %(a,b))
plt.legend()
plt.savefig("epsilon_%d_sigma_%d.png" % (i,j))
plt.show()
Your code is a bit unpythonic, so I tried to clean it up to the best of my knowledge. numpy.random.random and numpy.random.uniform(0, 1) are basically the same, however, the latter also allows you to pass the shape of the return array that you would like to have, in this case an array with 1000 rows and two columns (1000, 2). I then use some magic to assign the two colums of the return array to x and y in the same line, respectively.
numpy.hypot does as the name suggests and calculates the hypothenuse of x and y. It can also do that for each entry of arrays with the same size, saving you the for loops, which you should try to aviod in Python since they are pretty slow.
You used plt for all your plotting, which is fine as long as you only have one figure, but I would recommend to be as explicit as possible, according to one of Python's key notions:
explicit is better than implicit.
I recommend you read through this guide, in particular the section called 'Stateful Versus Stateless Approaches'. I changed your commands accordingly.
It is also very unpythonic to loop over items of a list using the index of the item in the list like you did (for i in range(len(list)): item = list[i]). You can just reference the item directly (for item in list:).
Lastly I changed your formatted strings to the more convenient f-strings. Have a read here.
import matplotlib.pyplot as plt
import numpy as np
def pot(epsi, sig, d):
result = 4*epsi*((sig/d)**12 - (sig/d)**6)
return result
# I am not sure why you would create the independent variable this way,
# maybe you are simulating something. In that case, the code below is
# simpler than your version and should achieve the same.
# x, y = zip(*np.random.uniform(0, 1, (1000, 2)))
# d = np.array(sorted(np.hypot(x, y)))
# If you only want to plot your pot function then creating the value range
# like this is just fine.
d = np.linspace(0.001, 1, 1000)
epsilons = sigmas = np.linspace(0.01, 1, num=10)
fig, ax = plt.subplots()
ax.set_xlim([0, 2])
ax.set_ylim([-1.5, 1.5])
line = None
for epsilon in epsilons:
for sigma in sigmas:
if line is None:
line = ax.plot(
d, pot(epsilon, sigma, d),
label=f'epsilon = {epsilon}, sigma = {sigma}'
)[0]
fig.legend()
else:
line.set_data(d, pot(epsilon, sigma, d))
# plt.savefig(f"epsilon_{epsilon}_sigma_{sigma}.png")
fig.show()

How to plot routes between pairs of starting and ending geospatial points using Folium?

I am having a pandas DataFrames with latitude longitude of start and end of a route, so these are my columns ('origin_lat, 'origin_lon',destination_lat','destination_lon').
I am able to plot the locations on Folium map but I am looking for a way to plot the routes between each location.
Here is the code I am using:
m = folium.Map([16.7, 81.095], zoom_start=11)
m
# mark each origin as a point
for index, row in df.iterrows():
folium.CircleMarker([row['origin_lat'], row['origin_lng']],
radius=15,
fill_color="#3db7e4", # divvy color
).add_to(m)
for index, row in df.iterrows():
folium.CircleMarker([row['destination_lat'], row['destination_lng']],
radius=15,
fill_color="red", # divvy color
).add_to(m)
#add routes
folium.PolyLine([list(zip(df.origin_lat, df.origin_lng)),list(zip(df.destination_lat, df.destination_lng))], line_opacity = 0.5, color='white',line_weight=5).add_to(m)
The code I am using is connecting all the origin locations together and all the destination locations together but I want to plot routes between origin and destination locations instead. Any way I can fix it?
If I understood your question correctly, I believe I have your solution
Some imports
import pandas as pd
import numpy as np
import folium
Some sample data
centroid_lat = 16.7
centroid_lon = 81.095
x = .1
n = 10
o_lats = np.random.uniform(low=centroid_lat - x, high=centroid_lat + x, size=(n,))
o_lons = np.random.uniform(low=centroid_lon - x, high=centroid_lon + x, size=(n,))
d_lats = np.random.uniform(low=centroid_lat - x, high=centroid_lat + x, size=(n,))
d_lons = np.random.uniform(low=centroid_lon - x, high=centroid_lon + x, size=(n,))
df = pd.DataFrame({'origin_lng' : o_lons, 'origin_lat' : o_lats,
'destination_lng': d_lons, 'destination_lat': d_lats})
print(df.head())
destination_lat destination_lng origin_lat origin_lng
0 16.797057 81.074000 16.660164 81.080038
1 16.615371 81.001004 16.772645 80.997770
2 16.784289 81.117082 16.670008 81.032719
3 16.686201 81.184775 16.787999 81.189585
4 16.757704 81.127280 16.720080 81.178466
Then the map. No need for two for loops and I'm creating a line each iteration
m = folium.Map([centroid_lat, centroid_lon], zoom_start=11)
for _, row in df.iterrows():
folium.CircleMarker([row['origin_lat'], row['origin_lng']],
radius=15,
fill_color="#3db7e4", # divvy color
).add_to(m)
folium.CircleMarker([row['destination_lat'], row['destination_lng']],
radius=15,
fill_color="red", # divvy color
).add_to(m)
folium.PolyLine([[row['origin_lat'], row['origin_lng']],
[row['destination_lat'], row['destination_lng']]]).add_to(m)
m

Python merge datasets X1(t), X2(t) -> X1(X2)

I have some datasets (lets stay at 2 here) which are dependent on a common variable t, like X1(t) and X2(t). However X1(t) and X2(t) don't have to share the same t values or even have the same amount of datapoints.
For example they could look like:
t1 = [2,6,7,8,10,13,14,16,17]
X1 = [10,10,10,20,20,20,30,30,30]
t2 = [3,4,5,6,8,10,11,14,15,16]
X2 = [95,100,100,105,158,150,142,196,200,204]
I am trying to create a new dataset YNew(XNew) (=X2(X1)) such that both datasets are linked without the shared variable t.
In this case it should look like:
XNew = [10,20,30]
YNew = [100,150,200]
where to every occuring X1-value a corresponding X2-value (a mean value) is assigned.
Is there an easy already known way to achieve this(maybe with pandas)?
My first guess would be to find all t-values for a certain X1-value (in the example case the X1-value 10 would lie in the range 2,...,7) and then look for all X2-values in that range and get their mean value. Then you should be able to assign YNew(XNew).
Thanks for every advice!
Update:
I added a graph, so maybe my intentions are a bit more clear. I want to assign the mean X2-value to the corresponding X1-value in the marked regions (where the same X1-values occur).
graph corresponding to example lists
alright, I just tried to implement what I mentioned and it works as I liked it.
Although I think that some things are still a little clumsy...
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# datasets to treat
t1 = [2,6,7,8,10,13,14,16,17]
X1 = [10,10,10,20,20,20,30,30,30]
t2 = [3,4,5,6,8,10,11,14,15,16]
X2 = [95,100,100,105,158,150,142,196,200,204]
X1Series = pd.Series(X1, index = t1)
X2Series = pd.Series(X2, index = t2)
X1Values = X1Series.drop_duplicates().values #returns all occuring values of X1 without duplicates as array
# lists for results
XNew = []
YNew = []
#find for every occuring value X1 the mean value of X2 in the range of X1
for value in X1Values:
indexpos = X1Series[X1Series == value].index.values
max_t = indexpos[indexpos.argmax()] # get max and min index of the range of X1
min_t =indexpos[indexpos.argmin()]
print("X1 = "+str(value)+" occurs in range from "+str(min_t)+" to "+str(max_t))
slicedX2 = X2Series[(X2Series.index >= min_t) & (X2Series.index <= max_t)] # select range of X2
print("in this range there are following values of X2:")
print(slicedX2)
mean = slicedX2.mean() #calculate mean value of selection and append extracted values
print("with the mean value of: " + str(mean))
XNew.append(value)
YNew.append(mean)
fig = plt.figure()
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)
ax1.plot(t1, X1,'ro-',label='X1(t)')
ax1.plot(t2, X2,'bo',label='X2(t)')
ax1.legend(loc=2)
ax1.set_xlabel('t')
ax1.set_ylabel('X1/X2')
ax2.plot(XNew,YNew,'ro-',label='YNew(XNew)')
ax2.legend(loc=2)
ax2.set_xlabel('XNew')
ax2.set_ylabel('YNew')
plt.show()

Adding a single label to the legend for a series of different data points plotted inside a designated bin in Python using matplotlib.pyplot.plot()

I have a script for plotting astronomical data of redmapping clusters using a csv file. I could get the data points in it and want to plot them using different colors depending on their redshift values: I am binning the dataset into 3 bins (0.1-0.2, 0.2-0.25, 0.25,0.31) based on the redshift.
The problem arises with my code after I distinguish to what bin the datapoint belongs: I want to have 3 labels in the legend corresponding to red, green and blue data points, but this is not happening and I don't know why. I am using plot() instead of scatter() as I also had to do the best fit from the data in the same figure. So everything needs to be in 1 figure.
import numpy as np
import matplotlib.pyplot as py
import csv
z = open("Sheet4CSV.csv","rU")
data = csv.reader(z)
x = []
y = []
ylow = []
yupp = []
xlow = []
xupp = []
redshift = []
for r in data:
x.append(float(r[2]))
y.append(float(r[5]))
xlow.append(float(r[3]))
xupp.append(float(r[4]))
ylow.append(float(r[6]))
yupp.append(float(r[7]))
redshift.append(float(r[1]))
from operator import sub
xerr_l = map(sub,x,xlow)
xerr_u = map(sub,xupp,x)
yerr_l = map(sub,y,ylow)
yerr_u = map(sub,yupp,y)
py.xlabel("$Original\ Tx\ XCS\ pipeline\ Tx\ keV$")
py.ylabel("$Iterative\ Tx\ pipeline\ keV$")
py.xlim(0,12)
py.ylim(0,12)
py.title("Redmapper Clusters comparison of Tx pipelines")
ax1 = py.subplot(111)
##Problem starts here after the previous line##
for p in redshift:
for i in xrange(84):
p=redshift[i]
if 0.1<=p<0.2:
ax1.plot(x[i],y[i],color="b", marker='.', linestyle = " ")#, label = "$z < 0.2$")
exit
if 0.2<=p<0.25:
ax1.plot(x[i],y[i],color="g", marker='.', linestyle = " ")#, label="$0.2 \leq z < 0.25$")
exit
if 0.25<=p<=0.3:
ax1.plot(x[i],y[i],color="r", marker='.', linestyle = " ")#, label="$z \geq 0.25$")
exit
##There seems nothing wrong after this point##
py.errorbar(x,y,yerr=[yerr_l,yerr_u],xerr=[xerr_l,xerr_u], fmt= " ",ecolor='magenta', label="Error bars")
cof = np.polyfit(x,y,1)
p = np.poly1d(cof)
l = np.linspace(0,12,100)
py.plot(l,p(l),"black",label="Best fit")
py.plot([0,15],[0,15],"black", linestyle="dotted", linewidth=2.0, label="line $y=x$")
py.grid()
box = ax1.get_position()
ax1.set_position([box.x1,box.y1,box.width, box.height])
py.legend(loc='center left',bbox_to_anchor=(1,0.5))
py.show()
In the 1st 'for' loop, I have indexed every value 'p' in the list 'redshift' so that bins can be created using 'if' statement. But if I add the labels that are hashed out against each py.plot() inside the 'if' statements, each data point 'i' that gets plotted in the figure as an intersection of (x[i],y[i]) takes the label and my entire legend attains in total 87 labels (including the 3 mentioned in the code at other places)!!!!!!
I essentially need 1 label for each bin...
Please tell me what needs to done after the bins are created and py.plot() commands used...Thanks in advance :-)
Sorry I cannot post my image here due to low reputation!
The data 'appended' for x, y and redshift lists from the csv file are as follows:
x=[5.031,10.599,10.589,8.548,9.089,8.675,3.588,1.244,3.023,8.632,8.953,7.603,7.513,2.917,7.344,7.106,3.889,7.287,3.367,6.839,2.801,2.316,1.328,6.31,6.19,6.329,6.025,5.629,6.123,5.892,5.438,4.398,4.542,4.624,4.501,4.504,5.033,5.068,4.197,2.854,4.784,2.158,4.054,3.124,3.961,4.42,3.853,3.658,1.858,4.537,2.072,3.573,3.041,5.837,3.652,3.209,2.742,2.732,1.312,3.635,2.69,3.32,2.488,2.996,2.269,1.701,3.935,2.015,0.798,2.212,1.672,1.925,3.21,1.979,1.794,2.624,2.027,3.66,1.073,1.007,1.57,0.854,0.619,0.547]
y=[5.255,10.897,11.045,9.125,9.387,17.719,4.025,1.389,4.152,8.703,9.051,8.02,7.774,3.139,7.543,7.224,4.155,7.416,3.905,6.868,2.909,2.658,1.651,6.454,6.252,6.541,6.152,5.647,6.285,6.079,5.489,4.541,4.634,8.851,4.554,4.555,5.559,5.144,5.311,5.839,5.364,3.18,4.352,3.379,4.059,4.575,3.914,5.736,2.304,4.68,3.187,3.756,3.419,9.118,4.595,3.346,3.603,6.313,1.816,4.34,2.732,4.978,2.719,3.761,2.623,2.1,4.956,2.316,4.231,2.831,1.954,2.248,6.573,2.276,2.627,3.85,3.545,25.405,3.996,1.347,1.679,1.435,0.759,0.677]
redshift = [0.12,0.25,0.23,0.23,0.27,0.26,0.12,0.27,0.17,0.18,0.17,0.3,0.23,0.1,0.23,0.29,0.29,0.12,0.13,0.26,0.11,0.24,0.13,0.21,0.17,0.2,0.3,0.29,0.23,0.27,0.25,0.21,0.11,0.15,0.1,0.26,0.23,0.12,0.23,0.26,0.2,0.17,0.22,0.26,0.25,0.12,0.19,0.24,0.18,0.15,0.27,0.14,0.14,0.29,0.29,0.26,0.15,0.29,0.24,0.24,0.23,0.26,0.29,0.22,0.13,0.18,0.24,0.14,0.24,0.24,0.17,0.26,0.29,0.11,0.14,0.26,0.28,0.26,0.28,0.27,0.23,0.26,0.23,0.19]
Working with numerical data like this, you should really consider using a numerical library, like numpy.
The problem in your code arises from processing each record (a coordinate (x,y) and the corresponding value redshift) one at a time. You are calling plot for each point, thereby creating legends for each of those 84 datapoints. You should consider your "bins" as groups of data that belong to the same dataset and process them as such. You could use "logical masks" to distinguish between your "bins", as shown below.
It's also not clear why you call exit after each plotting action.
import numpy as np
import matplotlib.pyplot as plt
x = np.array([5.031,10.599,10.589,8.548,9.089,8.675,3.588,1.244,3.023,8.632,8.953,7.603,7.513,2.917,7.344,7.106,3.889,7.287,3.367,6.839,2.801,2.316,1.328,6.31,6.19,6.329,6.025,5.629,6.123,5.892,5.438,4.398,4.542,4.624,4.501,4.504,5.033,5.068,4.197,2.854,4.784,2.158,4.054,3.124,3.961,4.42,3.853,3.658,1.858,4.537,2.072,3.573,3.041,5.837,3.652,3.209,2.742,2.732,1.312,3.635,2.69,3.32,2.488,2.996,2.269,1.701,3.935,2.015,0.798,2.212,1.672,1.925,3.21,1.979,1.794,2.624,2.027,3.66,1.073,1.007,1.57,0.854,0.619,0.547])
y = np.array([5.255,10.897,11.045,9.125,9.387,17.719,4.025,1.389,4.152,8.703,9.051,8.02,7.774,3.139,7.543,7.224,4.155,7.416,3.905,6.868,2.909,2.658,1.651,6.454,6.252,6.541,6.152,5.647,6.285,6.079,5.489,4.541,4.634,8.851,4.554,4.555,5.559,5.144,5.311,5.839,5.364,3.18,4.352,3.379,4.059,4.575,3.914,5.736,2.304,4.68,3.187,3.756,3.419,9.118,4.595,3.346,3.603,6.313,1.816,4.34,2.732,4.978,2.719,3.761,2.623,2.1,4.956,2.316,4.231,2.831,1.954,2.248,6.573,2.276,2.627,3.85,3.545,25.405,3.996,1.347,1.679,1.435,0.759,0.677])
redshift = np.array([0.12,0.25,0.23,0.23,0.27,0.26,0.12,0.27,0.17,0.18,0.17,0.3,0.23,0.1,0.23,0.29,0.29,0.12,0.13,0.26,0.11,0.24,0.13,0.21,0.17,0.2,0.3,0.29,0.23,0.27,0.25,0.21,0.11,0.15,0.1,0.26,0.23,0.12,0.23,0.26,0.2,0.17,0.22,0.26,0.25,0.12,0.19,0.24,0.18,0.15,0.27,0.14,0.14,0.29,0.29,0.26,0.15,0.29,0.24,0.24,0.23,0.26,0.29,0.22,0.13,0.18,0.24,0.14,0.24,0.24,0.17,0.26,0.29,0.11,0.14,0.26,0.28,0.26,0.28,0.27,0.23,0.26,0.23,0.19])
bin3 = 0.25 <= redshift
bin2 = np.logical_and(0.2 <= redshift, redshift < 0.25)
bin1 = np.logical_and(0.1 <= redshift, redshift < 0.2)
plt.ion()
labels = ("$z < 0.2$", "$0.2 \leq z < 0.25$", "$z \geq 0.25$")
colors = ('r', 'g', 'b')
for bin, label, co in zip( (bin1, bin2, bin3), labels, colors):
plt.plot(x[bin], y[bin], color=co, ls='none', marker='o', label=label)
plt.legend()
plt.show()

Categories