Can't save all matplotlib plots to pdf - missing some graphs (IPython) - python

I have a script that I run in IPython; it takes an input .csv file of gene_names and pushes them into this for loop:
with open('C:\Users\Work\Desktop\Book1.csv', 'rU') as f:
reader = csv.reader(f)
with PdfPages('poopyheadjoe04.pdf') as pdf:
for row in reader:
gene_name = row
probe_exclusion_keyword = []
print gene_name
Each gene_name value from the .csv file is then passed down to the branch if inference_method == "approximate_random": (in Scripts.py), which is marked with ** in the listing below:
with open('C:\Users\Work\Desktop\Book1.csv', 'rU') as f:
reader = csv.reader(f)
with PdfPages('poopyheadjoe04.pdf') as pdf:
for row in reader:
gene_name = row
probe_exclusion_keyword = []
print gene_name
print "Fetching probe ids for gene %s" % gene_name
probes_dict = get_probes_from_genes(gene_name)
print "Found %s probes: %s" % (len(probes_dict), ", ".join(probes_dict.values()))
if probe_exclusion_keyword:
probes_dict = {probe_id: probe_name for (probe_id, probe_name) in probes_dict.iteritems() if not args.probe_exclusion_keyword in probe_name}
print "Probes after applying exclusion cryterion: %s" % (", ".join(probes_dict.values()))
print "Fetching expression values for probes %s" % (", ".join(probes_dict.values()))
expression_values, well_ids, donor_names = get_expression_values_from_probe_ids(
probes_dict.keys())
print "Found data from %s wells sampled across %s donors" % (len(well_ids), len(set(donor_names)))
print "Combining information from selected probes"
combined_expression_values = combine_expression_values(
expression_values, method=probes_reduction_method)
print "Translating locations of the wells to MNI space"
mni_coordinates = get_mni_coordinates_from_wells(well_ids)
print "Checking values of the provided NIFTI file at well locations"
nifti_values = get_values_at_locations(
stat_map, mni_coordinates, mask_file=mask, radius=radius, verbose=True)
# preparing the data frame
names = ["NIFTI values", "%s expression" % gene_name, "donor ID"]
data = pd.DataFrame(np.array(
[nifti_values, combined_expression_values, donor_names]).T, columns=names)
data = data.convert_objects(convert_numeric=True)
len_before = len(data)
data.dropna(axis=0, inplace=True)
nans = len_before - len(data)
if nans > 0:
print "%s wells fall outside of the mask" % nans
if inference_method == "fixed":
print "Performing fixed effect analysis"
fixed_effects(data, ["NIFTI values", "%s expression" % gene_name])
**if inference_method == "approximate_random":**
print "Performing approximate random effect analysis"
approximate_random_effects(
data, ["NIFTI values", "%s expression" % gene_name], "donor ID")
print "poopy"
pdf.savefig()
plt.ion() #should i add ion() here?
if inference_method == "bayesian_random":
print "Fitting Bayesian hierarchical model"
bayesian_random_effects(
data, ["NIFTI values", "%s expression" % gene_name], "donor ID", n_samples, n_burnin)
# if __name__ == '__main__': #What exactly does this do? Start trigger for the script to run?
# main()
that triggers approximate_random_effects(in Analysis.py) to plot two graphs, the violinplot and the lmplot:
def approximate_random_effects(data, labels, group):
    """Fit one regression per donor, test the slopes against zero, and plot.

    For every distinct value of ``data[group]`` a ``linregress`` of
    ``data[labels[1]]`` on ``data[labels[0]]`` is fitted and its slope kept.
    The slopes are averaged and compared with 0 using ``ttest_1samp``.
    A violin plot of the slopes and an ``lmplot`` with one panel per donor
    are then drawn.

    Parameters:
        data: table indexable by column name (a pandas DataFrame in the
            calling script).
        labels: two column names; ``labels[0]`` is the x variable and
            ``labels[1]`` the y variable of each regression.
        group: name of the column that identifies the donor.

    Returns:
        Tuple ``(average_slope, t, p_val)``.
    """
    correlation_per_donor = {}
    for donor_id in set(data[group]):
        donor_rows = data[group] == donor_id
        # linregress returns the slope first; the other four values are unused.
        correlation_per_donor[donor_id], _, _, _, _ = linregress(
            list(data[labels[0]][donor_rows]),
            list(data[labels[1]][donor_rows]))

    # Materialise the slopes once so the same list feeds numpy, scipy and seaborn.
    slopes = list(correlation_per_donor.values())
    average_slope = np.array(slopes).mean()
    t, p_val = ttest_1samp(slopes, 0)
    print("Averaged slope across donors = %g (t=%g, p=%g)" % (average_slope, t, p_val))

    sns.violinplot([slopes], inner="points", names=["donors"])
    plt.ylabel("Linear regression slopes between %s and %s" % (labels[0], labels[1]))
    plt.axhline(0, color="red")

    # NOTE: nothing is saved between the two plots. lmplot builds a figure of
    # its own, so a caller that saves "the current figure" afterwards only
    # gets the lmplot -- this is the behaviour the question is about.
    sns.lmplot(labels[0], labels[1], data, hue=group, col=group, col_wrap=3)
    plt.ion()
    return average_slope, t, p_val
I'm trying to save both graphs for all the gene_names into a pdf file, by roughly following "Saving multiple figures to one pdf file in matplotlib" and the matplotlib.PdfPages approach.
However, in the pdf file, I am only getting the lmplot for all my gene_names and NOT the violin plot. What do I do to fix this?
Thanks! Help will be much appreciated!

It looks like your code is creating two figures, one for each plot, but you only call pdf.savefig() once, after the second figure has been created, so only the second figure is saved.
If you want one figure per page in your pdf, you need to call pdf.savefig() twice: once after creating each plot.
I would recommend that you change the structure of your program a bit, so you can save the pdf after each plot:
def approximate_random_effects(data, labels, group, pdf=None):
    """Fit one regression per donor, test the slopes, and save both plots.

    For every distinct value of ``data[group]`` a ``linregress`` of
    ``data[labels[1]]`` on ``data[labels[0]]`` is fitted and its slope kept.
    The slopes are averaged and compared with 0 using ``ttest_1samp``.
    The violin plot of the slopes and the per-donor ``lmplot`` are each
    written to their own PDF page.

    Parameters:
        data: table indexable by column name (a pandas DataFrame in the
            calling script).
        labels: two column names; ``labels[0]`` is the x variable and
            ``labels[1]`` the y variable of each regression.
        group: name of the column that identifies the donor.
        pdf: optional, already open ``PdfPages`` object to append the two
            pages to. When omitted, 'poopyheadjoe04.pdf' is created for this
            call only; because that rewrites the file each time, pass a
            shared ``pdf`` when calling this function once per gene.

    Returns:
        Tuple ``(average_slope, t, p_val)``.
    """
    correlation_per_donor = {}
    for donor_id in set(data[group]):
        donor_rows = data[group] == donor_id
        # linregress returns the slope first; the other four values are unused.
        correlation_per_donor[donor_id], _, _, _, _ = linregress(
            list(data[labels[0]][donor_rows]),
            list(data[labels[1]][donor_rows]))

    slopes = list(correlation_per_donor.values())
    average_slope = np.array(slopes).mean()
    t, p_val = ttest_1samp(slopes, 0)
    print("Averaged slope across donors = %g (t=%g, p=%g)" % (average_slope, t, p_val))

    if pdf is not None:
        _save_random_effects_plots(pdf, slopes, data, labels, group)
    else:
        with PdfPages('poopyheadjoe04.pdf') as own_pdf:
            _save_random_effects_plots(own_pdf, slopes, data, labels, group)
    return average_slope, t, p_val


def _save_random_effects_plots(pdf, slopes, data, labels, group):
    """Draw the violin plot and the lmplot grid, one PDF page each."""
    violin_fig = plt.figure()
    sns.violinplot([slopes], inner="points", names=["donors"])
    plt.ylabel("Linear regression slopes between %s and %s" % (labels[0], labels[1]))
    plt.axhline(0, color="red")
    pdf.savefig(violin_fig)  # Saving first figure
    plt.close(violin_fig)    # pyplot keeps figures alive until they are closed

    # lmplot is a figure-level function: it creates its own figure, so no
    # plt.figure() call is made here (that would only add an empty page).
    # The figure to save is the one owned by the returned grid.
    grid = sns.lmplot(labels[0], labels[1], data, hue=group, col=group, col_wrap=3)
    pdf.savefig(grid.fig)    # Saving second figure
    plt.close(grid.fig)
You then need to delete in your main program the lines with PdfPages('poopyheadjoe04.pdf') as pdf:, pdf.savefig() and plt.ion().
If you need the two plots on the same pdf page, you need to change the violinplot and lmplot calls so that they draw on different axes of the same figure.

Related

Making python plots in a for loop that gives each plot a different title

....
1. I am writing Python code that creates plots of data imported from a CITIfile. I want each plot to have a different title. For example, the first plot should have the title S11 Log Magnitude, the second S21 Log Magnitude, the third S12 Log Magnitude, and the fourth S22 Log Magnitude. The code I have written now produces the titles 0, 1, 2, and 3, using plt.title(str(i)). What modifications can I make to this code so that it produces the desired plot titles in this sequence?
....
# modified based on https://github.com/feph/citidata
import citidata
import glob
import numpy as np
from numpy import *
import matplotlib.pyplot as plt
keyslist = [] # data name
datalist = [] # data arrays
M = N = 0
all_my_files = glob.glob("*.citi")
for filename in all_my_files:
M += 1
print("=== %s ===" % filename)
citi_file = citidata.genfromfile(filename)
for package in citi_file.packages:
print(package)
print(package.indep)
#print(package.deps) # suppress screen output
for key in package.deps:
N += 1
value = package.deps[key] # get data field
keyslist.append(key) # append key
datalist.append(value['data']) # append np array data
print('\n ', M, 'files read;', N, 'datasets recorded.')
print('dataset : name')
#plt.figure(0)
w = []
x = np.linspace(8, 12, 201)
for i in range(N):
fig = plt.figure(i)
print(i, ':', keyslist[i])
y = datalist[i] # data
# print(y)
test = np.abs(y)
f = sqrt(test)
mag = 20*log10(f)
print(mag)
# [S11, S21, S12,S22]
# y = np.append(mag)
plt.xlabel('Frequancy (Hz')
plt.ylabel('Log Magnitude (dB')
plt.plot(x, mag)
plt.title(str(i))
Since the titles do not follow a simple numeric sequence, a straightforward way to do this is with a dictionary. Create a dictionary in the global scope whose keys are the plot indices and whose values are the corresponding titles:
# Plot titles keyed by dataset index. The order follows the S-parameter
# order noted in the plotting loop's comment: [S11, S21, S12, S22].
name_dict = {
    0: "S11 Log Magnitude",
    1: "S21 Log Magnitude",
    2: "S12 Log Magnitude",
    3: "S22 Log Magnitude",
}
After that, you just change the last code line to
plt.title(name_dict[i])
I hope this was helpful!
EDIT 1:
Sorry, I have changed the key number to start from 0.
EDIT 2:
Forgot commas in the dictionary and just added them

Memory management in Django

I'm doing some analysis on my Django database. I do many queries in a loop and some of these queries may return big results.
So, after a while the whole 8 GB of RAM on my EC2 instance is eaten and I cannot even ssh to the machine any longer.
I have to reboot the instance then start over again.
I tried the solution mentioned here:
https://baxeico.wordpress.com/2014/09/30/optimize-django-memory-usage/
But the queryset_iterator method seems not to work with aggregated queries.
I'm pretty sure that any single query cannot consume all 8 GB of RAM. So, this means that the old results are not deleted from memory.
How do I force a query out of the memory before the end of its loop iteration and before executing the next query?
Here is my code:
def get_users_event_distribution(monthYear, event_type=None):
    """Save a bar chart of the per-user event-count distribution for a month.

    The chart is written to 'charts/<monthYear>_<title>_event_dist.png'
    (spaces removed), where <title> is ``event_type`` or 'All'. If that file
    already exists nothing is queried or drawn.

    Parameters:
        monthYear: object with ``month`` and ``year`` attributes; it is also
            interpolated into the file name.
        event_type: optional event type to restrict the query to; when falsy
            all events of the month are used.
    """
    title = event_type if event_type else 'All'
    filename = 'charts/%s_%s_event_dist.png' % (monthYear, title)
    filename = filename.replace(' ', '')
    if os.path.isfile(filename):
        print('Chart already in file %s' % (filename))
        return

    # Build the filter once; the event_type restriction is the only part
    # that differs between the two variants of the query.
    filters = {'time__month': monthYear.month, 'time__year': monthYear.year}
    if event_type:
        filters['event_type'] = event_type
    users = EVENT.objects.filter(**filters).values_list('user').annotate(count=Count('id'))
    uc = users.count()
    print('We have %d users' % (uc))

    print('Building Count Dictionary')
    # Each row is (user, count); tally how many rows share the same count.
    count_dict = dict()
    for u in users:
        count_dict[u[1]] = count_dict.get(u[1], 0) + 1
    print('Built the count dictionary with %d keys' % (len(count_dict.keys())))

    fig, ax = plt.subplots(figsize=(20, 20))
    positions = range(len(count_dict))
    bars = plt.bar(positions, list(count_dict.values()), align='edge')
    plt.xticks(positions, list(count_dict.keys()))
    ax.set_ylabel('# Users')
    ax.set_xlabel('# %s Events' % (title))
    ax.set_title('%s Event Distribution' % (title))
    ax.relim()
    # update ax.viewLim using the new dataLim
    ax.autoscale_view()

    # Attach a text label above each bar displaying its height.
    for rect in bars:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * height,
                '%d' % int(height),
                ha='center', va='bottom')

    plt.savefig(filename, bbox_inches='tight', dpi=100)
    # pyplot retains every figure until it is explicitly closed; without this
    # call, running the function in a loop keeps all figures in memory.
    plt.close(fig)
    print('saved the distribution chart to %s' % (filename))
def get_users_all_event_distribution(monthYear):
    """Save the event-distribution chart for all events and for each type.

    Calls ``get_users_event_distribution`` once without an event type and
    then once per entry of ``EVENT_TYPE``, using the first element of each
    entry as the event type.
    """
    get_users_event_distribution(monthYear)
    for choice in EVENT_TYPE:
        # Pass the event type taken from EVENT_TYPE; the previous code passed
        # an undefined name here.
        get_users_event_distribution(monthYear, choice[0])
I run get_users_all_event_distribution for different dates in a loop.
With more analysis, I found out that the problem was in matplot figures as stated here in this warning:
/usr/local/lib64/python2.7/site-packages/matplotlib/pyplot.py:524:
RuntimeWarning: More than 20 figures have been opened. Figures created
through the pyplot interface (matplotlib.pyplot.figure) are retained
until explicitly closed and may consume too much memory. (To control
this warning, see the rcParam figure.max_open_warning).
max_open_warning, RuntimeWarning)
I fixed it by adding a plt.close('all') line.

Only last graph is getting pasted in pdf file in python

I am reading the parameters from different CSV files and creating the graphs after comparing the parameters across the CSVs. The problem is only last graph is getting pasted in PDF for the last parameter.
with PdfPages('example.pdf') as pdf:
for arg in sys.argv[1:]:
file_reader= open(arg, "rt", encoding='ascii')
read = csv.reader(file_reader)
for row in read:
if operation_OnDut in row:
column_Result = row[10]
resultOfOperations_OnDut_List.append(column_Result)
buildNumber = row[0]
buildName_List.append(buildNumber)
N = len(resultOfOperations_OnDut_List)
ind = np.arange(N)
#Draw graph for operations performed in that TEST CASE
y = resultOfOperations_OnDut_List
width = .1
fig, ax = plt.subplots()
plt.bar(ind, y, width, label = column_Parameters, color="blue")
plt.xticks(ind, buildName_List)
plt.title("Performance and Scale")
plt.ylabel('Result of Operations')
plt.xlabel('Execution Builds')
plt.legend()
plt.tight_layout()
pdf.savefig()
plt.close()
resultOfOperations_OnDut_List = []
buildName_List = []
You probably got the indentation wrong...
Try
with PdfPages('example.pdf') as pdf:
for arg in sys.argv[1:]:
file_reader= open(arg, "rt", encoding='ascii')
read = csv.reader(file_reader)
for row in read:
if operation_OnDut in row:
column_Result = row[10]
....
# one level deeper
N = len(resultOfOperations_OnDut_List)
ind = np.arange(N)
#Draw graph for operations performed in that TEST CASE
...
Note that the section starting with N = len(resultOfOperations_OnDut_List) has been shifted four spaces to the left to be within the first for loop. If you want it to be within the second for loop add four more spaces.

Matplotlib plot iterating plot features

I am writing a Python script that will cycle through every ASCII file in a directory and output an individual plot of the data contained with said file. Moreover, it finds the best curve-fitting parameters and outputs those into one combined text file.
The text output works perfectly, but the plot output does not. On the first iteration it works fine, but after that the plot features accumulate. For example, on the third iteration of for filename in os.listdir(directory), the plot contains three legend labels, three fit curves, and each data point three times. Here is a screenshot:
Here is the code I am using:
for filename in os.listdir(directory):
if filename.endswith((".lc", ".lc1")):
wf = directory + "/" + filename
try:
# get the data
time, flux = getData(wf, nheader)
# fit the data
popt_sine, err_sine = sineFit(time, flux)
popt_exp, err_exp = expFit(time, flux)
# plot/save the data
plotData(filename, time, flux, popt_sine, popt_exp)
# output the data
output = outputData(popt_sine, err_sine, popt_exp, err_exp)
print(output)
except StopIteration:
raise IOError("End of File Error")
where I have defined the function plotData as:
def plotData(filename, time, flux, popt_sine, popt_exp):
    """Plot the data points with the sine fit and save the plot as a .jpg.

    Parameters:
        filename: input file name ending in '.lc' or '.lc1'; the image is
            saved under the same name with that extension replaced by '.jpg'.
        time: sequence of sample times (x values of the data points).
        flux: sequence of flux values (y values of the data points).
        popt_sine: the four sine-fit parameters passed on to ``sine``.
        popt_exp: exponential-fit parameters; currently not drawn, accepted
            so existing callers keep working.

    Raises:
        ValueError: if ``filename`` ends with neither '.lc' nor '.lc1'.
    """
    # Work out the output name first so an unsupported file is rejected
    # before anything is drawn.
    if filename.endswith(".lc"):
        out_name = filename[:-3] + ".jpg"
    elif filename.endswith(".lc1"):
        out_name = filename[:-4] + ".jpg"
    else:
        raise ValueError("Incorrect input file type")

    t = np.linspace(time[0], time[-1], 10000)

    # Start a new figure for every file. Without it, pyplot keeps drawing on
    # the same implicit figure, so each saved image also contains the curves
    # and legend entries of all previously processed files.
    fig = plt.figure()

    # Sine plot
    plt.plot(t, sine(t, popt_sine[0], popt_sine[1], popt_sine[2], popt_sine[3]),
             "r", label="Sine Fit")
    # Exponential plot (disabled):
    # plt.plot(t, exponential(t, popt_exp[0], popt_exp[1], popt_exp[2], popt_exp[3]),
    #          label='Exponential Fit')
    # Raw plot
    plt.plot(time, flux, "ko", label="Data")

    # Plot config.
    plt.xlabel("Time (sec)")
    plt.ylabel("Flux")
    plt.title("Mean Normalized Flux vs. Time (" + filename + ")")
    plt.legend(loc='best')

    # save plot, then release the figure
    plt.savefig(out_name)
    plt.close(fig)
One interesting thing I noticed: if I do plt.show() rather than trying to save the plot, wherein I view and exit each plot separately, each plot will be correct.

Biopython: How to avoid particular amino acid sequences from a protein so as to plot Ramachandran plot?

I have written a python script to plot the 'Ramachandran Plot' of Ubiquitin protein. I am using biopython. I am working with pdb files. My script is as below :
import Bio.PDB
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
phi_psi = ([0,0])
phi_psi = np.array(phi_psi)
pdb1 ='/home/devanandt/Documents/VMD/1UBQ.pdb'
for model in Bio.PDB.PDBParser().get_structure('1UBQ',pdb1) :
for chain in model :
polypeptides = Bio.PDB.PPBuilder().build_peptides(chain)
for poly_index, poly in enumerate(polypeptides) :
print "Model %s Chain %s" % (str(model.id), str(chain.id)),
print "(part %i of %i)" % (poly_index+1, len(polypeptides)),
print "length %i" % (len(poly)),
print "from %s%i" % (poly[0].resname, poly[0].id[1]),
print "to %s%i" % (poly[-1].resname, poly[-1].id[1])
phi_psi = poly.get_phi_psi_list()
for res_index, residue in enumerate(poly) :
#res_name = "%s%i" % (residue.resname, residue.id[1])
#print res_name, phi_psi[res_index]
phi_psi = np.vstack([phi_psi \
,np.asarray(phi_psi[res_index])]).astype(np.float)
#np.float - conversion to float array from object
phi, psi = np.transpose(phi_psi)
phi = np.degrees(phi)
psi = np.degrees(psi)
phi = phi[~np.isnan(phi)] # avoiding nan
psi = psi[~np.isnan(psi)]
f,ax = plt.subplots(1)
plt.title('Ramachandran Plot for Ubiquitin')
plt.xlabel('$\phi^o$', size=20,fontsize=15)
plt.ylabel('$\psi^o$ ', size=20,fontsize=15)
h=ax.hexbin(phi, psi, extent=[-180,180,-180,180],cmap=plt.cm.Blues)
#h=ax.hexbin(phi, psi,gridsize=35, extent=[-180,180,-180,180],cmap=plt.cm.Blues)
f.colorbar(h)
plt.grid()
plt.show()
I would like to modify this code so as to neglect the GLYCINE amino acid and then plot Ramachandran plot. My output is as below:
You can remove them after indexing the GLYs:
for poly_index, poly in enumerate(polypeptides):
gly_index = [i for i, res in enumerate(poly) if res.get_resname() == "GLY"]
After the main loop and phi/psi calculation, delete the points from the array:
new_phi_psi = np.delete(phi_psi, gly_index, 0)
phi, psi = np.transpose(new_phi_psi)
Remove the step where you get rid of the NaNs. Now plot the points to get something like this:
h=ax.hexbin(phi, psi, extent=[-180,180,-180,180],cmap=plt.cm.Blues)
h=ax.hexbin(n_phi, n_psi, extent=[-180,180,-180,180],cmap=plt.cm.Reds)

Categories