I have a two-column txt file and I want to know the value of the small bump around 0.8 V (see picture). What is the best way to calculate this?
import numpy as np
import matplotlib.pyplot as plt
data = np.genfromtxt("220525_01-O2-LSV-LMOink-1600rpm-rep1.txt", delimiter=";", names=["x", "y"])
data_2 = np.genfromtxt("220525_01-O2-LSV-LMOink-1600rpm-rep2.txt", delimiter=";", names=["x", "y"])
plt.plot(data['x'], data['y'], label='ORR polarization repetition 1')
plt.plot(data_2['x'], data_2['y'], label='ORR polarization repetition 2')
plt.title('LSV LMO-ink')
plt.xlabel('Potential Applied ($V$)')
plt.ylabel('Current ($A$)')
plt.grid()
plt.legend()
plt.savefig('ORR.png', dpi=100)
plt.show()
I tried it with scipy's find_peaks, but did not succeed.
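In case it helps, here is a minimal sketch of how find_peaks is usually applied to a small feature like this; the voltage window and the prominence threshold are assumptions you will need to tune to your data:
import numpy as np
from scipy.signal import find_peaks
data = np.genfromtxt("220525_01-O2-LSV-LMOink-1600rpm-rep1.txt",
                     delimiter=";", names=["x", "y"])
# Restrict the search to a window around 0.8 V (window limits are a guess)
mask = (data['x'] > 0.7) & (data['x'] < 0.9)
x_win, y_win = data['x'][mask], data['y'][mask]
# A small bump is easier to pick up by its prominence than by its absolute height;
# the prominence value below is a placeholder that must be tuned to your current scale
peaks, props = find_peaks(y_win, prominence=1e-6)
for p, prom in zip(peaks, props["prominences"]):
    print(f"bump at {x_win[p]:.3f} V, current {y_win[p]:.3e} A, prominence {prom:.2e}")
If the bump sits on a sloping background, subtracting a smooth baseline (for example a low-order polynomial fitted over the window) before calling find_peaks usually helps; and if the bump points downwards (cathodic current), search -y_win instead.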
The following code is a sample showing how the problem arises.
import pandas as pd
import matplotlib.pyplot as plt
#Reading data
data = pd.read_csv("mydata.csv",parse_dates=['date'])
data = data.iloc[0:17, :]
#Plotting data
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
ax.plot(data['date'],data['y'],'-o')
ax.set(xlabel='Date', ylabel='y')
ax.grid()
plt.show()
The result is the following: the grid is displaced with respect to the data points (grid_displaced).
If I remove ,parse_dates=['date'], everything works fine (grid_not_displaced).
Here is the link to the data file https://drive.google.com/file/d/1AWcyIKgtDY_xkT_gaUxsiwjq9vLGfMog/view?usp=sharing
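For what it is worth, a likely explanation is that parse_dates turns the column into datetimes, so matplotlib switches to its date locator and places ticks (and therefore grid lines) at "nice" calendar positions rather than at your data points. A minimal sketch of one workaround, assuming that is the cause, is to pin the ticks to the data themselves:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
data = pd.read_csv("mydata.csv", parse_dates=['date'])
data = data.iloc[0:17, :]
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(data['date'], data['y'], '-o')
# Put a tick (and therefore a grid line) exactly at every data point
ax.set_xticks(data['date'])
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
fig.autofmt_xdate()
ax.set(xlabel='Date', ylabel='y')
ax.grid()
plt.show()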
I have two csv files with the same column names in different orders. I need to correlate the columns the two files have in common and plot a correlation plot of the data. For example, from file1 and file2 I want to compare the 'ASA' column and build a correlation plot from it. I have a separate file with all of the column names I need to compare, so that I can loop through the millions of files I have, but pandas keeps raising an AttributeError when I plug my 'list' into y = data2.lines.values or x = data1.lines.values. I tried looping and making matching conditions, but nothing has worked.
import csv
import pandas as pd
import numpy as np
from pandas import DataFrame
import matplotlib.pyplot as plt
from scipy import linspace, polyval, polyfit, sqrt, stats, randn
#File1
data1 = pd.read_csv('sorted_42650files_from_the_1.7chembl_database.csv',low_memory=False) #read large csv filei
#File2
data2 = pd.read_csv('sorted_60kdat_without_duplicates.csv',low_memory=False)
#File3
data3 = pd.read_csv('headers.csv',low_memory=False)
lines = data3.readlines()
#create my x and y column for linear regression comparison based only on the descritor.csv header name
x = data1.lines.values
y = data2.lines.values
print(x)
print(y)
#plot it
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
plt.plot(x,y, 'o', color='purple')
plt.plot(x,intercept + slope*x, color="black", label='fitted line' + ',' + '$R^2$={:.4f}'.format(r_value))
plt.suptitle('MOE ASA+ Descriptor Correlation Plot', fontsize=14)
plt.xlabel('Ab Initio', fontsize=16)
plt.ylabel('Molecular Mechanics', fontsize=16)
plt.legend(loc=4)
print('$R^2$={:.4f}'.format(r_value))
plt.show()
plt.savefig('ASA+')
plt.close()
file data1
file data2
file data3 (all of the headers I want to correlate between the files)
Thank you for any help you can give me.
I found the answer to my question. It was all about the syntax in the df.values portion of my script. Here is the new script, which loops over the different headers you want to compare from two files and performs a linear regression on them:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# read the list of headers to compare
input_file = "headers.csv"
headerfile = open(input_file, "r")
fields = headerfile.read().splitlines()
headerfile.close()
#print(fields)
#File1
data1 = pd.read_csv('sorted_42650files_from_the_1.7chembl_database.csv', low_memory=False)  # read large csv file
#File2
data2 = pd.read_csv('sorted_60kdat_without_duplicates.csv', low_memory=False)
#create my x and y columns for linear regression comparison based only on the header names
for lines in fields:
    #print(lines)
    x = data1[lines]
    y = data2[lines]
    #plot it
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    plt.plot(x, y, 'o', color='purple')
    plt.plot(x, intercept + slope*x, color="black",
             label='fitted line' + ',' + '$R^2$={:.4f}'.format(r_value))
    plt.suptitle('MOE ASA+ Descriptor Correlation Plot', fontsize=14)
    plt.xlabel('Ab Initio', fontsize=16)
    plt.ylabel('Molecular Mechanics', fontsize=16)
    plt.legend(loc=4)
    print(lines + ' $R^2$={:.4f}'.format(r_value))
    # plt.show()
    plt.savefig(lines)
    plt.close()
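As a possible refinement (not part of the original script): if the two files genuinely share header spellings and have matching rows, you could skip maintaining headers.csv and intersect the column names directly. A sketch, assuming the same file names as above:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
data1 = pd.read_csv('sorted_42650files_from_the_1.7chembl_database.csv', low_memory=False)
data2 = pd.read_csv('sorted_60kdat_without_duplicates.csv', low_memory=False)
# Columns present in both files, kept in the order they appear in data1
common = [c for c in data1.columns if c in set(data2.columns)]
for col in common:
    x = data1[col]
    y = data2[col]
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    plt.plot(x, y, 'o', color='purple')
    # note: linregress returns Pearson r, so square it to report R^2
    plt.plot(x, intercept + slope*x, color='black',
             label='fitted line, $R^2$={:.4f}'.format(r_value**2))
    plt.xlabel('Ab Initio', fontsize=16)
    plt.ylabel('Molecular Mechanics', fontsize=16)
    plt.legend(loc=4)
    plt.savefig(col)
    plt.close()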
I want to plot a coloured contour graph with x, y, z taken from 3 columns of a comma-delimited text file, but each time I try the code below I get ValueError: too many values to unpack (expected 3). I would be grateful if that could be resolved.
I would also like to know if there is another (probably better) way of plotting the 3 independent columns.
This is the code:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import scipy.interpolate
N = 100000
long_col, lat_col, Bouguer_col = np.genfromtxt(r'data.txt', unpack=True)
xi = np.linspace(long_col.min(), long_col.max(), N)
yi = np.linspace(lat_col.min(), lat_col.max(), N)
zi = scipy.interpolate.griddata((long_col, lat_col), Bouguer_col, (xi[None,:], yi[:,None]), method='cubic')
fig = plt.figure()
plt.contourf(xi, yi, zi)
plt.xlabel("Long")
plt.ylabel("Lat")
plt.show()
This is the 'data.txt' sample data.
Lat, Long, Elev, ObsGrav, Anomalies
6.671482000000001022e+00,7.372505999999999560e+00,3.612977999999999952e+02,9.780274000000000233e+05,-1.484474523360840976e+02
6.093078000000000216e+00,7.480882000000001142e+00,1.599972999999999956e+02,9.780334000000000233e+05,-1.492942383352201432e+02
6.092045999999999850e+00,7.278669999999999973e+00,1.462445999999999913e+02,9.780663000000000466e+05,-1.190960417173337191e+02
6.402087429999999912e+00,7.393360939999999992e+00,5.237939999999999827e+02,9.780468000000000466e+05,-8.033459449396468699e+01
6.264082730000000154e+00,7.518244540000000420e+00,2.990849999999999795e+02,9.780529000000000233e+05,-1.114865156192099676e+02
6.092975000000000030e+00,7.482914000000000065e+00,1.416474000000000046e+02,9.780338000000000466e+05,-1.525697779102483764e+02
6.383570999999999884e+00,7.289616999999999791e+00,2.590403000000000020e+02,9.780963000000000466e+05,-8.300666170357726514e+01
6.318417000000000172e+00,7.557638000000000744e+00,1.672036999999999978e+02,9.780693000000000466e+05,-1.246774551668204367e+02
6.253779999999999895e+00,7.268805999999999656e+00,1.059429999999999978e+02,9.781026999999999534e+05,-9.986763240839354694e+01
6.384635000000000282e+00,7.291032000000000401e+00,2.615624000000000251e+02,9.780963000000000466e+05,-8.256190758384764194e+01
If the data file looks exactly like in the question, you first of all have 5 columns, which you cannot unpack into 3 variables.
Next, you have a header line which you do not want to be part of the data. Also, the header line is separated by ,<space>, while the data is separated by ,.
So in total you need
import numpy as np
a,b,c,d,e = np.genfromtxt("data.txt", unpack=True, delimiter=",", skip_header=1)
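If only three of the five columns are wanted, usecols avoids the dummy variables; assuming the Bouguer values are the Anomalies column (index 4), and noting that the file stores Lat before Long:
import numpy as np
# Columns in data.txt: 0 = Lat, 1 = Long, 2 = Elev, 3 = ObsGrav, 4 = Anomalies
lat_col, long_col, Bouguer_col = np.genfromtxt(
    "data.txt", unpack=True, delimiter=",", skip_header=1, usecols=(0, 1, 4))
Also be aware that N = 100000 in the original script asks griddata for a 100000 x 100000 grid; a few hundred points per axis is normally plenty for contourf.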
I have a 1D array, data = [5 1 100 102 3 4 999 1001 5 1 2 150 180 175 898 1012]. I am using Python's scipy.cluster.vq to find clusters within it. There are 3 clusters in the data. After clustering, when I try to plot the data, it is no longer in order.
It would be great if it were possible to plot the data in the same order as it is given and color the different sections that belong to different groups or clusters.
Here is my code:
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.vq import kmeans, vq
data = np.loadtxt('rawdata.csv', delimiter=' ')
#----------------------kmeans------------------
centroid,_ = kmeans(data, 3)
idx,_ = vq(data, centroid)
x=np.linspace(0,(len(data)-1),len(data))
fig = plt.figure(1)
plt.plot(x,data)
plot1=plt.plot(data[idx==0],'ob')
plot2=plt.plot(data[idx==1],'or')
plot3=plt.plot(data[idx==2],'og')
plt.show()
Here is my plot:
http://s29.postimg.org/9gf7noe93/figure_1.png
(The blue graph in the background is in order; after clustering, it gets messed up.)
Thanks!
Update:
I wrote the following code to produce an in-order colored plot after clustering:
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.vq import kmeans, vq
data = np.loadtxt('rawdata.csv', delimiter=' ')
#----------------------kmeans-----------------------------
centroid,_ = kmeans(data, 3) # three clusters
idx,_ = vq(data, centroid)
x=np.linspace(0,(len(data)-1),len(data))
fig = plt.figure(1)
plt.plot(x,data)
for i in range(0,(len(data)-1)):
    if data[i] in data[idx==0]:
        plt.plot(x[i],(data[i]),'ob' )
    if data[i] in data[idx==1]:
        plt.plot(x[i],(data[i]),'or' )
    if data[i] in data[idx==2]:
        plt.plot(x[i],(data[i]),'og' )
plt.show()
The problem with the above code is that it is too slow, and my array size is over 3 million, so it will take forever to finish its job.
I would really appreciate it if someone could provide a vectorized version of the above code.
Thanks!
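For reference, a sketch of the kind of vectorized replacement you are asking for (same file name as above assumed): instead of testing membership point by point, index with the boolean masks from idx, or hand the whole array to a single scatter call:
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.vq import kmeans, vq
data = np.loadtxt('rawdata.csv', delimiter=' ')
centroid, _ = kmeans(data, 3)
idx, _ = vq(data, centroid)
x = np.arange(len(data))
plt.plot(x, data, color='lightgray')   # the in-order trace in the background
# one plot call per cluster, using boolean masks instead of a Python loop
for k, style in enumerate(['ob', 'or', 'og']):
    mask = idx == k
    plt.plot(x[mask], data[mask], style)
# or, equivalently, a single call: plt.scatter(x, data, c=idx, s=8)
plt.show()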
You can plot the clustered data points based on their distances from the cluster centers, and then annotate each point with its index in order to see how the points scatter according to their clustering properties:
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.vq import kmeans, vq
from scipy.spatial.distance import cdist
data=np.array([ 5, 1, 100, 102, 3, 4, 999, 1001, 5, 1, 2, 150, 180, 175, 898, 1012])
centroid,_ = kmeans(data, 3)
idx,_ = vq(data, centroid)
X=data.reshape(len(data),1)
Y=centroid.reshape(len(centroid),1)
D_k = cdist( X, Y, metric='euclidean' )
colors = ['red', 'green', 'blue']
pId=range(0,(len(data)-1))
cIdx = [np.argmin(D) for D in D_k]
dist = [np.min(D) for D in D_k]
r=np.vstack((data,dist)).T
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
mark=['^','o','>']
for i, ((x,y), kls) in enumerate(zip(r, cIdx)):
    ax.plot(r[i,0],r[i,1],color=colors[kls],marker=mark[kls])
    ax.annotate(str(i), xy=(x,y), xytext=(0.5,0.5), textcoords='offset points',
                size=8,color=colors[kls])
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('Data')
ax.set_ylabel('Distance')
plt.show()
Update:
If you are very keen on using a vectorized procedure, you can do it as follows for randomly generated data:
data=np.random.uniform(1,1000,3000)
# note: cIdx (the cluster index per point) must be recomputed for this new data
@np.vectorize
def plotting(i):
    ax.plot(i,data[i],color=colors[cIdx[i]],marker=mark[cIdx[i]])
mark=['>','o','^']
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
plotting(range(len(data)))
ax.set_xlabel('index')
ax.set_ylabel('Data')
plt.show()
import matplotlib.pyplot as plt
import numpy as np
mu = np.loadtxt('my_data/corr.txt')
d = mu[:,2]
y=[]
tot=0
min=999
for i in d:
    y.append(float(i))
    tot=tot+float(i)
    if (min>float(i)):
        min=float(i)
av=tot/len(y)
z=[]
m=[]
for i in y:
    z.append(i-av)
    m.append(i-min)
plt.acorr(z,usevlines=True,maxlags=None,normed=True)
plt.show()
With this code I get an output showing a bar chart.
Now,
1) How do I change this plot style to give just the trend line? I can't modify the line properties by any means.
2) How do I write this output data to a .dat or .txt file?
This should be a working minimal example:
import matplotlib.pyplot as plt
import numpy as np
from numpy.random import normal
data = normal(0, 1, 1000)
# return values are the lags, the correlation values, the plotted line, and a fourth value that is None when usevlines=False
lags, corr, line, rest = plt.acorr(data, marker=None, linestyle='-', color='red', usevlines=False)
plt.show()
np.savetxt("correlations.txt", np.transpose((lags, corr)), header='Lags\tCorrelation')
But I would recommend not connecting the points.
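Following that recommendation, a marker-only variant of the same call (no connecting line) would be:
import matplotlib.pyplot as plt
import numpy as np
from numpy.random import normal
data = normal(0, 1, 1000)
# markers only at each lag, no connecting line
lags, corr, line, rest = plt.acorr(data, usevlines=False,
                                   marker='o', linestyle='none', color='red')
plt.show()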