Calculate subplot adjustment - python

So I have some computed data that should now be visualised. For each data element, I want to place a separate subplot so that the whole figure is as compact as possible. Here's an example for five elements (a 3x2 grid with the unused sixth slot discarded):
Here's a prototype I came up with for an arbitrary element count:
import matplotlib.pyplot as plt
import numpy as np
import math

data = ...  # some list of pairs of numpy arrays, for x and y axes
size = len(data)
cols = math.floor(math.sqrt(size))
rows = math.ceil(size / cols)
f, diags = plt.subplots(rows, cols)
for (row, col), diag in np.ndenumerate(diags):
    dataIdx = row * cols + col
    if dataIdx < size:
        x = data[dataIdx][0]
        y = data[dataIdx][1]
        diag.scatter(x, y)
        diag.set_title('Regressor {}'.format(dataIdx + 1))
    else:  # discard empty subplots
        f.delaxes(diag)
f.show()
A short explanation: for compactness, I try to arrange the plots as a square matrix where possible; if that doesn't work out, I add another row for the remaining diagrams. I then iterate over the diagrams, compute the position of the corresponding data element, and plot its values. If no data element exists for a diagram, the diagram is a leftover in the last row and can be discarded.
However, this is the code I would probably write in C++ or Java; the question is, what would be the pythonic way?
Also, what would be the best solution when iterating over the data instead of the diagrams? I could of course calculate the diagram's row/column from the element index the same way I did in the initial rows/columns calculation, but maybe there's a better way to do this...
Thanks in advance!

I would likely create the plot like this:
size = len(data)
cols = round(math.sqrt(size))
rows = cols
while rows * cols < size:
    rows += 1
f, ax_arr = plt.subplots(rows, cols)
ax_arr = ax_arr.reshape(-1)  # flatten the 2-D axes grid for easy indexing
for i in range(len(ax_arr)):
    if i >= size:
        ax_arr[i].axis('off')  # hide the leftover subplots
        continue
    x = data[i][0]
    y = data[i][1]
    ax_arr[i].scatter(x, y)
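As for the second part of the question, iterating over the data instead of the diagrams: one pythonic option is to pair the flattened axes with the data via zip and switch off whatever is left over. A minimal sketch under the same assumptions (data is a list of (x, y) pairs, and rows * cols > 1 so ax_arr really is an array):

for ax, (x, y) in zip(ax_arr, data):  # zip stops at the shorter sequence
    ax.scatter(x, y)
for ax in ax_arr[len(data):]:  # axes beyond the data are leftovers
    ax.axis('off')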

Related

Selecting a subset of columns in a matrix using values stored in another matrix in Python

I am trying to subset a matrix by using values from another smaller matrix. The number of rows in each are the same, but the smaller matrix has fewer columns. Each column in the smaller matrix contains the value of the column in the larger matrix that should be referenced. Here is what I have done, along with comments that hopefully describe this better, along with what I have tried. (The wrinkle in this is that the values of the columns to be used in each row change...)
I have tried Google, searching on Stack Overflow, etc., and can't find what I'm looking for. (The closest I came was something in Sage called matrix_from_columns, which isn't being used here.) So I'm probably making a very simple referencing error.
TIA,
mconsidine
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view
#Problem: for each row in a matrix/image I need to replace
# a value in a particular column in that row by a
# weighted average of some of the values on either
# side of that column in that row. The wrinkle
# is that the column that needs to be changed may
# vary from row to row. The columns that need to
# have their values changed is stored in an array.
#
# How do I do something like:
# img[:, selectedcolumnarray] = somefunction(img,targetcolumnmatrix)
#
# I can do this for setting the selectedcolumnarray to a value, like 0
# But I am not figuring out how to select the targeted values to
# average.
#dimensions of subset of the matrix/image that will be averaged
rows = 7
columns = 5
#weights that will be used to average surrounding values
the_weights = np.ones((rows,columns)).astype(float)*(1/columns)
print(the_weights)
#make up some data to create a set of column
# values that vary by row
y = np.asarray(range(0,rows)).astype(float)
x = -0.095*(y**2) - 0.05*y + 12.123
fit=[x.astype(int),x-x.astype(int),y]
print(np.asarray(fit)[0])
#create a test array, eg "image' of 20 columns that will have
# values in targeted columns replaced
testarray = np.asarray(range(1,21))
img = np.ones((rows,20)).astype(np.uint16)
img = img*testarray.T #give it some values
print(img)
#for each row, the column whose value will be replaced
targetcolumn = np.asarray(fit)[0].astype(int)
print(targetcolumn)
#calculate the range of columns in each row that
# will be used in the averaging
startcol = targetcolumn-2
endcol = targetcolumn+2
testcoords=np.linspace(startcol,endcol,5).astype(int).T
#this is the correct set of columns in the corresponding
# row to use for averaging
print(testcoords)
img2=img.copy()
#this correctly replaces the targetcolumn values with 0
# but I want to replace them with the sum of the values
# in the respective row of testcoords, weighted by the_weights
img2[np.arange(rows),targetcolumn]=0
#so instead of selecting the one column, I want to select
# the block of the image represented by testcoords, calculate
# a weighted average for each row, and use those values instead
# of 0 to set the values in targetcolumn
#starting again with the 7x20 (rowsxcolumns) "image"
img3=img.copy()
#indexing the image with this directly, eg img3[:,testcoords],
# gives me the wrong size, ie (7,7,5), when I think I want (7,5)
print(testcoords.shape)
#I thought "take" might help, but ... nope
#img3=np.take(img,testcoords,axis=1)
#something here maybe??? :
#https://stackoverflow.com/questions/40084931/taking-subarrays-from-numpy-array-with-given-stride-stepsize
# but I can't figure out what
##### plot surface to try to visualize what is going on ####
'''
fig, ax = plt.subplots(subplot_kw={"projection": "3d"})
# Make data.
X = np.arange(0, 20, 1)
Y = np.arange(0, rows, 1)
X, Y = np.meshgrid(X, Y)
Z = img2
# Plot the surface.
surf = ax.plot_surface(X, Y, Z, cmap=cm.coolwarm,
                       linewidth=0, antialiased=False)
# Customize the z axis.
ax.set_zlim(0, 20)
ax.zaxis.set_major_locator(LinearLocator(10))
# A StrMethodFormatter is used automatically
ax.zaxis.set_major_formatter('{x:.02f}')
# Add a color bar which maps values to colors.
fig.colorbar(surf, shrink=0.5, aspect=5)
plt.show()
'''
It turns out that "take_along_axis" does the trick:
imgsubset = np.take_along_axis(img3,testcoords,axis=1)
print(imgsubset)
newvalues = imgsubset * the_weights
print(newvalues)
newvalues = np.sum(newvalues, axis=1)
print(newvalues)
img3[np.arange(rows),targetcolumn] = np.round(newvalues,0)
print(img3)
(It becomes more obvious when non-trivial weights are used.)
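For reference, the subset, weight, and sum steps can also be collapsed into a single assignment; a sketch assuming the same img3, testcoords, the_weights and targetcolumn as above:

# subset with take_along_axis, apply the weights, sum per row, write back
img3[np.arange(rows), targetcolumn] = np.round(
    np.sum(np.take_along_axis(img3, testcoords, axis=1) * the_weights, axis=1), 0)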
Thanks for listening...
mconsidine

Optimizing a simple Photon Detection Simulation

I am a medical physics student trying to simulate photon detection. I succeeded (below), but I want to make it better by speeding it up: it currently takes 50 seconds to run and I want it to run in some fraction of that time. I assume someone more knowledgeable in Python could optimize it to complete in less than 10 seconds (without reducing the num_photons_detected values). Thank you very much for trying out this little optimization challenge.
from random import seed
from random import random
import random
import matplotlib.pyplot as plt
import numpy as np

rows, cols = (25, 25)
num_photons_detected = [10**3, 10**4, 10**5, 10**6, 10**7]
lesionPercentAboveNoiseLevel = [1, 0.20, 0.10, 0.05]
index_range = np.array([i for i in range(rows)])

for l in range(len(lesionPercentAboveNoiseLevel)):
    pixels = np.array([[0.0 for i in range(cols)] for j in range(rows)])
    for k in range(len(num_photons_detected)):
        random.seed(a=None, version=2)
        photons_random_pixel_choice = np.array([random.choice(index_range) for z in range(rows)])
        counts = 0
        while num_photons_detected[k] > counts:
            for i in photons_random_pixel_choice:
                photons_random_pixel_choice = np.array([random.choice(index_range) for z in range(rows)])  # further ensures random pixel selection
                for j in photons_random_pixel_choice:
                    pixels[i, j] += 1
                    counts += 1
        plt.imshow(pixels, cmap="gray")  # in the resulting images/graphs, x is on the vertical and y on the horizontal
        plt.show()
I think that, aside from efficiency issues, a problem with the code is that it does not select the positions of the photons truly at random. Instead, it selects row numbers, and then for each selected row it picks column numbers where photons will be observed in that row. As a result, if a row number is not selected, there will be no photons in that row at all, and if the same row is selected several times, there will be many photons in it. This is visible in the produced plots, which have a clear pattern of lighter and darker rows.
Assuming that this is unintended and that each pixel should have equal chances of being selected, here is a function generating an array of a given size, with a given number of randomly selected pixels:
import numpy as np

def generate_photons(rows, cols, num_photons):
    rng = np.random.default_rng()
    # one flat pixel index per photon, drawn uniformly with replacement
    indices = rng.choice(rows*cols, num_photons)
    # np.add.at accumulates repeated indices, unlike plain fancy-index assignment
    np.add.at(pix := np.zeros(rows*cols), indices, 1)
    return pix.reshape(rows, cols)
You can use it to produce images with specified parameters. E.g.:
import matplotlib.pyplot as plt
pixels = generate_photons(rows=25, cols=25, num_photons=10**4)
plt.imshow(pixels, cmap="gray")
plt.show()
gives an image with uniformly random photon counts, free of the row pattern seen before.
photons_random_pixel_choice = np.array([random.choice(index_range) for z in range(rows)])
It seems like the goal here is:
Use a pre-made sequence of integers, 0 to 24 inclusive, to select one of those values.
Repeat that process 25 times in a list comprehension, to get a Python list of 25 random values in that range.
Make a 1-d Numpy array from those results.
This is very much missing the point of using Numpy. If we want integers in a range, then we can directly ask for those. But more importantly, we should let Numpy do the looping as much as possible when using Numpy data structures. This is where it pays to read the documentation:
size: int or tuple of ints, optional
Output shape. If the given shape is, e.g., (m, n, k), then m * n * k samples are drawn. Default is None, in which case a single value is returned.
So, just make it directly with a numpy Generator: photons_random_pixel_choice = rng.integers(rows, size=(rows,)).
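A minimal sketch of that replacement, assuming numpy's Generator API (np.random.default_rng) rather than the stdlib random module:

import numpy as np

rng = np.random.default_rng()
rows = 25
# one uniformly random index in [0, rows) per row, drawn in a single call
photons_random_pixel_choice = rng.integers(rows, size=(rows,))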

How to Write a plt.scatter(x, y) function in one line where y=function of x

I was plotting a scatter plot to show the null values in a dataframe. As you can see, the plt.scatter() call is not expressive enough: the relation between list(range(0,1200)) and a is not clear unless you read the previous lines. Can plt.scatter(x, y) be written in a more explicit way, so that it is easily understood how x and y are related? Ideally, somebody who only sees the plt.scatter(x, y) line would understand what it is about.
a = []
for i in range(0, 1200):
    feature_with_na = [feature for feature in df.columns if df[feature].isnull().sum() > i]
    a.append(len(feature_with_na))
plt.scatter(list(range(0, 1200)), a)
On your x-axis you have a threshold n; on the y-axis you want to plot the number of columns in your DataFrame that have more than n null values.
Instead of your loop, you can count the number of null values within each column and use numpy broadcasting ([:, None]) to compare the per-column counts with an array of your thresholds. This lets you define the thresholds once, as xarr, and use that same array both in the comparison and as the x-values of the plot.
Sample Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  # imported as plt, matching the calls below

df = pd.DataFrame(np.random.choice([1, 2, 3, 4, 5, np.nan], (100, 10)))
Code
# Range of 'x' values to consider
xarr = np.arange(0, 100)
plt.scatter(xarr, (df.isnull().sum().to_numpy()>xarr[:, None]).sum(axis=1))
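To make the broadcasting explicit, here is the same computation with the intermediate shapes spelled out, assuming the 100-row, 10-column sample frame above:

null_counts = df.isnull().sum().to_numpy()  # shape (10,): null count per column
thresholds = xarr[:, None]                  # shape (100, 1): one threshold per row
exceeds = null_counts > thresholds          # broadcasts to shape (100, 10)
plt.scatter(xarr, exceeds.sum(axis=1))      # shape (100,): columns above each threshold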
ALollz's answer is good, but here's a less numpy-heavy alternative if that's your thing:
feature_null_counts = df.isnull().sum()
n_nulls = list(range(100))
features_with_n_nulls = [sum(feature_null_counts > n) for n in n_nulls]
plt.scatter(n_nulls, features_with_n_nulls)

Looking for a way to adjust the values of one array based on another array?

I started with a set of bivariate data. My goal is to first find points in that data set for which the y-values are outliers. Then, I wanted to create a new data set that included not only the outlier points, but also any points with an x-value of within 0.01 of any given outlier point.
Then (if possible) I want to subtract the original outlier x-values from the new x-set, so that I have a group of points with x-values of between -0.01 and 0.01, with x-value now indicating distance from an original outlier x-value.
I have this code:
import numpy as np
mean = np.mean(y)
SD = np.std(y)
x_indices = [i for i in range(len(y)) if ((y[i]) > ((2*SD)+mean))]
expanded_indices = [i for i in range(len(x)) if np.any((abs(x[i] - x[x_indices])) < 0.01)]
This worked great, and now I can call (and plot) x and y using the indices:
plt.plot(x[expanded_indices],y[expanded_indices])
However, I have no idea how to subtract the original "x_indices" values to get an x range of -0.01 to 0.01, since everything I tried failed.
I want to do something like what I have below, except I know that I can't subtract two arrays of different sizes, and I'm worried I can't use np.any in this context either.
x_values = [(x[expanded_indices] - x[indices]) if np.any((abs(x[expanded_indices] - x[indices])) < 0.01)]
Any ideas? I'm sorry this is so long -- I'm very new at this and pretty lost. I've been giving it a go for the last few hours and any assistance would be appreciated. Thanks!
sample data could be as follows:
x = [0, 0.994, 0.995, 0.996, 0.997, 0.998, 1.134, 1.245, 1.459, 1.499, 1.500, 1.501, 2.103, 2.104, 2.105, 2.106]
y = [1.5, 1.6, 1.5, 1.6, 10, 1.5, 1.5, 1.5, 1.6, 1.6, 1.5, 1.6, 1.5, 11, 1.6, 1.5]
Once you have the set with the y-outliers and the set with the expanded values, you can go over the whole second set and subtract the corresponding first-set value, using two for loops:
import numpy as np

x = np.array([0, 0.994, 0.995, 0.996, 0.997, 0.998, 1.134, 1.245, 1.459, 1.499, 1.500, 1.501, 2.103, 2.104, 2.105, 2.106])
y = np.array([1.5, 1.6, 1.5, 1.6, 10, 1.5, 1.5, 1.5, 1.6, 1.6, 1.5, 1.6, 1.5, 11, 1.6, 1.5])
mean = np.mean(y)
SD = np.std(y)
# elements whose y-value lies outside the defined region
indices = [i for i in range(len(y)) if y[i] > (2*SD + mean)]
my_1st_set = x[indices]
# set of values within 0.01 of a 1st-set point
expanded_indices = [i for i in range(len(x)) if np.any(abs(x[i] - x[indices]) < 0.01)]
my_2nd_set = x[expanded_indices]
# a final set with the subtracted values from the 2nd set
my_final_set = my_2nd_set.copy()  # copy, so my_2nd_set itself is not overwritten
for i in range(my_final_set.size):
    for j in range(my_1st_set.size):
        if abs(my_final_set[i] - my_1st_set[j]) < 0.01:
            my_final_set[i] -= my_1st_set[j]
            break
my_final_set is a numpy array with the result of subtracting the corresponding first-set value from each value of the expanded set.
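With the sample data above, you can inspect the three sets directly:

print(my_1st_set)    # x-values of the outlier points
print(my_2nd_set)    # x-values within 0.01 of an outlier
print(my_final_set)  # signed distances from the matching outlier x-value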
Let's see if I understood you correctly. This code should find the outliers, and put an array into res for each outlier.
import numpy as np

x = np.array([0, 0.994, 0.995, 0.996, 0.997, 0.998, 1.134, 1.245, 1.459, 1.499, 1.500, 1.501, 2.103, 2.104, 2.105, 2.106])
y = np.array([1.5, 1.6, 1.5, 1.6, 10, 1.5, 1.5, 1.5, 1.6, 1.6, 1.5, 1.6, 1.5, 11, 1.6, 1.5])
mean = np.mean(y)  # the statistics have to be computed after y is defined
SD = np.std(y)

outlier_indices = np.abs(y - mean) > 2*SD
res = []
for x_at_outlier in x[np.flatnonzero(outlier_indices)]:
    part_res = x[np.abs(x - x_at_outlier) < 0.01]  # fancy indexing returns a copy
    part_res -= np.mean(part_res)  # centre each group on its own mean
    res.append(part_res)
res is now a list of arrays, with each array containing the values around one outlier. Perhaps it is easier to continue working with the data in this format?
If you want all of them in one numpy array:
res = np.hstack(res)

Count frequencies of x, y coordinates, display in 2D and plot

I am trying to plot how frequently each combination of isolation-year difference and nucleotide difference occurs among pairs of viral biological sequences. I am trying to find an elegant way to do it and am having trouble.
So I have an alignment and I compare each sequence against every other to get an integer value of how different they are. I also check how far apart their years of isolation are. So a pair of sequences isolated two years apart with three differences gives the coordinates (2,3). I want to count how many times (2,3) occurs, as well as all other combinations, and plot it (and get the plot data). I have been trying to convert a list of frequencies to a dataframe, to no avail, and I am wondering if there is a better way to do it.
I can show some code but I am not sure this is the best way so I want to hear other ideas.
One problem is how to represent the frequencies in the beginning. I can create a list of all of the occurrences or create a dictionary of the occurrences and increment a counter.
Sample data:
(year difference, sequence residue differences):
(1,2), (2,5), (1,2), (5, 5), (4, 5)
Output is shown in the picture but it does NOT have to be in a table structure. CSV is preferred.
I'm heavily borrowing the table construction from this post.
The difference here is in constructing the array data: by initialising an array of zeros, for every coordinate (i, j) you increment that array element by one, representing one more occurrence of that pair.
zip(*coords) groups all the i values together in one tuple and all the j values in another. By finding the maximum value of each, we know the size our array needs to be. Note that each dimension must be bigger by 1 than the maximum x and y to account for 0, i.e. indices 0 through x require x+1 rows.
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.table import Table

def table_plot(data):
    fig, ax = plt.subplots()
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, 1, 1])
    nrows, ncols = data.shape
    width, height = 1.0 / ncols, 1.0 / nrows
    for (i, j), val in np.ndenumerate(data):
        tb.add_cell(i, j, width, height, text=str(val) if val else '', loc='center')
    for i in range(data.shape[0]):
        tb.add_cell(i, -1, width, height, text=str(i), loc='right',
                    edgecolor='none', facecolor='none')
    for i in range(data.shape[1]):
        tb.add_cell(-1, i, width, height/2, text=str(i), loc='center',
                    edgecolor='none', facecolor='none')
    tb.set_fontsize(16)
    ax.add_table(tb)
    return fig

coords = ((1, 2), (2, 5), (1, 2), (5, 5), (4, 5))
# get maximum value for both x and y to allocate the array
x, y = map(max, zip(*coords))
data = np.zeros((x+1, y+1), dtype=int)
for i, j in coords:
    data[i, j] += 1
table_plot(data)
plt.show()
Output: a rendered table with the frequency of each (i, j) coordinate pair in its cell.
Assuming your (year, discrepancy) tuples are in a list called samples as in the example below
import random
samples = [(random.randint(0,10), random.randint(0,10)) for i in range(100) ]
you can get the frequency of each pair as described in this other stackoverflow post How to count the frequency of the elements in a list?
import collections
counter=collections.Counter(samples)
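Applied to the sample pairs from the question instead of random data, the counter already holds the frequency of each coordinate:

import collections

samples = [(1, 2), (2, 5), (1, 2), (5, 5), (4, 5)]
counter = collections.Counter(samples)
print(counter)  # Counter({(1, 2): 2, (2, 5): 1, (5, 5): 1, (4, 5): 1})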
To visualize this frequency table, you can convert it to a numpy matrix and use matshow from matplotlib:
import numpy as np
import matplotlib.pyplot as plt

x_max = max([x[0] for x in samples])
y_max = max([x[1] for x in samples])
freq = np.zeros((x_max+1, y_max+1))
for coord, f in counter.items():  # Counter.iteritems() was Python 2; use items()
    freq[coord[0]][coord[1]] = f
plt.matshow(freq, cmap=plt.cm.gray)
plt.show()
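Since the question says CSV output is preferred, the frequency matrix can also be written straight to a file; a minimal sketch assuming the freq array above (frequencies.csv is a hypothetical filename):

# one row per year difference, one comma-separated column per residue difference
np.savetxt('frequencies.csv', freq, fmt='%d', delimiter=',')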
