I have more than 1000 .csv files (data_1.csv ... data_1000.csv), each containing X and Y values:
x1 y1 x2 y2
5.0 60 5.5 500
6.0 70 6.5 600
7.0 80 7.5 700
8.0 90 8.5 800
9.0 100 9.5 900
I have made a subplot program in python which can give two plots (plot1 - X1vsY1, Plot2 - X2vsY2) at a time using one file.
I need help in looping all the files, (open a file, read it, plot it, pick another file, open it, read it, plot it, ... until all the files in a folder get plotted)
I have the following code:
import pandas as pd
import matplotlib.pyplot as plt
# Question code: reads one CSV file and draws two stacked line plots from it.
# header=1 tells pandas to take the column names from the second line of the file.
# NOTE(review): "data_csv" has no extension — presumably "data.csv" was meant; confirm.
df1=pd.read_csv("data_csv",header=1,sep=',')
fig = plt.figure()
# Upper panel: column at position 1 on the x-axis, column at position 2 on the y-axis.
plt.subplot(2, 1, 1)
plt.plot(df1.iloc[:,[1]],df1.iloc[:,[2]])
# Lower panel: column at position 3 on the x-axis, column at position 4 on the y-axis.
# NOTE(review): positions 1-4 need a file with at least five columns; the four-column
# sample shown in the question would need positions 0-3 — confirm against the real files.
plt.subplot(2, 1, 2)
plt.plot(df1.iloc[:,[3]],df1.iloc[:,[4]])
plt.show()
How can this be accomplished more efficiently?
You can generate a list of filenames using glob and then plot them in a for loop.
import glob
import pandas as pd
import matplotlib.pyplot as plt
# Collect the input files. glob returns names in arbitrary order, so sort them
# to process data_1.csv, data_2.csv, ... in a reproducible sequence.
files = sorted(glob.glob('*.csv'))  # adjust the pattern to match your files

for file in files:
    # NOTE(review): header=1 and column positions 1-4 are carried over from the
    # question; the four-column sample shown there would need header=0 and
    # positions 0-3 — confirm against the real files.
    df1 = pd.read_csv(file, header=1, sep=',')
    fig = plt.figure()
    plt.subplot(2, 1, 1)
    plt.plot(df1.iloc[:, [1]], df1.iloc[:, [2]])
    plt.subplot(2, 1, 2)
    plt.plot(df1.iloc[:, [3]], df1.iloc[:, [4]])
    plt.show()  # this will stop the loop until you close the plot
    # Release the figure explicitly so that looping over 1000+ files does not
    # keep every figure alive in memory.
    plt.close(fig)
I used NetCDF(.nc) just in case anyone is interested in using NetCDF data. Also, you could replace it with .txt too, the idea is the same. I used this for a contour plot loop.
import os

import matplotlib.pyplot as plt
import xarray as xr

path_to_folder = '#type the path to the files'

# Grid used for the subplots; change these to the shape you want. The grid must
# have at least as many cells as there are .nc files in the folder.
N_ROWS, N_COLS = 1, 2

fig = plt.figure(figsize=(10, 5))
files = []  # full paths of the NetCDF files that were plotted
for name in sorted(os.listdir(path_to_folder)):
    if not name.endswith('.nc'):
        continue
    # os.listdir returns bare names, so join them with the folder; otherwise the
    # files are only found when the script happens to run inside that folder.
    full_path = os.path.join(path_to_folder, name)
    files.append(full_path)
    # The context manager closes the dataset (and its file handle) after plotting.
    with xr.open_dataset(full_path) as data:
        prec = data['tp']
        plt.subplot(N_ROWS, N_COLS, len(files))
        # This draws a contour plot, but you can replace it with any plot command.
        prec.groupby('time.month').mean(dim=('time', 'longitude')).T.plot.contourf(cmap='Purples')
print(files)
plt.savefig('try.png', dpi=500, orientation='landscape', format='png')
Here is the basic setup for what I am using here at work. This code will plot the data from each file, going through each file separately. It will work on any number of files as long as the column names remain the same. Just direct it to the proper folder.
import os
import csv
def graphWriterIRIandRut():
    """Plot three columns of every CSV file in the current working directory.

    For each ``.csv`` file, 'Col 2 Name' is drawn against 'Col 1 Name' in the
    upper subplot and 'Col 3 Name' against 'Col 1 Name' in the lower subplot.
    All files are drawn into the same two subplots, each curve in a random
    colour. The figure is shown once after every file has been drawn and is
    then cleared and closed.

    Raises:
        KeyError: if a CSV file lacks one of the expected column names.
        ValueError: if a cell cannot be converted to ``float``.
    """
    import random

    import matplotlib.pyplot as plt

    def _random_colour():
        # One random RGB triple; a valid single colour for ``plt.plot``.
        return (random.random(), random.random(), random.random())

    for name in sorted(os.listdir(os.getcwd())):
        if not name.lower().endswith('.csv'):
            continue  # the folder may also hold this script, images, etc.

        # Fresh lists per file, so one file's rows are never joined to the next.
        col1, col2, col3 = [], [], []
        with open(name, newline='') as handle:
            for row in csv.DictReader(handle):
                # csv yields strings; convert so the axes are numeric rather
                # than categorical.
                col1.append(float(row['Col 1 Name']))
                col2.append(float(row['Col 2 Name']))
                col3.append(float(row['Col 3 Name']))

        plt.subplot(2, 1, 1)
        plt.grid(True)
        plt.plot(col1, col2, c=_random_colour())
        plt.tick_params(axis='both', which='major', labelsize=8)

        plt.subplot(2, 1, 2)
        plt.grid(True)
        plt.plot(col1, col3, c=_random_colour())
        plt.tick_params(axis='both', which='major', labelsize=8)

    plt.show()
    plt.gcf().clear()
    plt.close('all')
# plotting all the file data and saving the plots
import os
import csv
import matplotlib.pyplot as plt
def graphWriterIRIandRut():
    """Plot the x1/y1 and x2/y2 pairs of every CSV file in the working directory.

    For each ``.csv`` file, y1 is drawn against x1 in the upper subplot and y2
    against x2 in the lower subplot. All files share the same two subplots and
    matplotlib's default colour cycle tells the files apart. The figure is
    shown once after every file has been drawn and is then cleared and closed.

    Raises:
        KeyError: if a CSV file lacks one of the columns x1, y1, x2, y2.
        ValueError: if a cell cannot be converted to ``float``.
    """
    for name in sorted(os.listdir(os.getcwd())):
        if not name.lower().endswith('.csv'):
            continue  # skip the script itself and any other non-CSV file

        # Fresh lists per file, so one file's rows are never joined to the next.
        x1, y1, x2, y2 = [], [], [], []
        with open(name, newline='') as handle:
            for row in csv.DictReader(handle):
                # csv yields strings; convert so the axes are numeric.
                x1.append(float(row['x1']))
                y1.append(float(row['y1']))
                x2.append(float(row['x2']))
                y2.append(float(row['y2']))

        plt.subplot(2, 1, 1)
        plt.grid(True)
        plt.plot(x1, y1)
        plt.tick_params(axis='both', which='major', labelsize=8)

        plt.subplot(2, 1, 2)
        plt.grid(True)
        plt.plot(x2, y2)
        plt.tick_params(axis='both', which='major', labelsize=8)

    plt.show()
    plt.gcf().clear()
    plt.close('all')
What we want to do is create new empty lists for each iteration, i.e. for each file. That way each file's data is plotted on its own, and a fresh set of empty lists is created before the next file is read and plotted. Once the data from every file has been plotted, finally call plt.show(), which will show all the plots together. Here is a link to a similar problem I was having: Traceback lines on plot of multiple files. Good luck!
import csv
import matplotlib.pyplot as plt
def graphWriter():
    """Plot the x1/y1 and x2/y2 pairs of every CSV file in the working directory.

    New lists are created for every file, so the curve of one file is never
    connected to the curve of the next. y1 is drawn against x1 in the upper
    subplot and y2 against x2 in the lower one, each curve in a random colour.
    ``plt.show()`` is called once, after all files have been drawn.

    Raises:
        KeyError: if a CSV file lacks one of the columns x1, y1, x2, y2.
        ValueError: if a cell cannot be converted to ``float``.
    """
    import os
    import random

    def _random_colour():
        # One random RGB triple; a valid single colour for ``plt.plot``.
        return (random.random(), random.random(), random.random())

    for filename in sorted(os.listdir(os.getcwd())):
        if not filename.lower().endswith('.csv'):
            continue  # skip the script itself and any other non-CSV file

        x1, y1, x2, y2 = [], [], [], []
        with open(filename, 'r', newline='') as handle:
            for row in csv.DictReader(handle):
                # csv yields strings; convert so the axes are numeric.
                x1.append(float(row['x1']))
                y1.append(float(row['y1']))
                x2.append(float(row['x2']))
                y2.append(float(row['y2']))

        plt.subplot(2, 1, 1)
        plt.grid(True)
        plt.plot(x1, y1, c=_random_colour())
        plt.tick_params(axis='both', which='major', labelsize=8)

        plt.subplot(2, 1, 2)
        plt.grid(True)
        plt.plot(x2, y2, c=_random_colour())
        plt.tick_params(axis='both', which='major', labelsize=8)

    plt.show()
    plt.gcf().clear()
    plt.close('all')
If for some reason @Neill Herbst's answer (which I consider the easiest way) didn't work as expected — I ran into a problem reading the files — here is the rearranged code that worked for me:
import glob
import pandas as pd
import matplotlib.pyplot as plt
import os

# Move into the data folder so the relative glob pattern below finds the files.
os.chdir(r'path')

for file in glob.glob("*.csv"):
    # NOTE(review): header=1 and column positions 1-4 are carried over from the
    # question; confirm that they match the layout of the real files.
    df1 = pd.read_csv(file, header=1, sep=',')
    fig = plt.figure()
    plt.subplot(2, 1, 1)
    plt.plot(df1.iloc[:, [1]], df1.iloc[:, [2]])
    plt.subplot(2, 1, 2)
    plt.plot(df1.iloc[:, [3]], df1.iloc[:, [4]])
    plt.show()  # plots one csv; when you close the window, the next one is plotted

# To see all the plots at once in different windows, remove the plt.show()
# inside the loop and call plt.show() once here, after the loop.
Using p = Path(...): p → WindowsPath('so_data/files')
files = p.rglob(...) yields all files matching the pattern
file[0] → WindowsPath('so_data/files/data_1.csv')
p.parent / 'plots' / f'{file.stem}.png' → WindowsPath('so_data/plots/data_1.png')
p.parent → WindowsPath('so_data')
file.stem → data_1
This assumes all directories exist. Directory creation / checking is not included.
This example uses pandas, as does the OP.
Plotted with pandas.DataFrame.plot, which uses matplotlib as the default backend.
Use .iloc to specify the columns, and then x=0 will always be the x-axis data, based on the given example data.
Tested in python 3.8.11, pandas 1.3.2, matplotlib 3.4.3
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
p = Path('so_data/files')  # folder that holds the input CSV files
files = p.rglob('data_*.csv')  # lazy generator over every file matching the pattern
plots_dir = p.parent / 'plots'  # output folder; it must already exist

for file in files:
    # Adjust the header row and separator to match the files.
    df = pd.read_csv(file, header=0, sep=',')

    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(7, 5))
    ax1, ax2 = axes

    # After selecting a pair with .iloc, the x data is always at position 0.
    df.iloc[:, [0, 1]].plot(x=0, ax=ax1)  # first x/y pair
    df.iloc[:, [2, 3]].plot(x=0, ax=ax2)  # second x/y pair

    target = plots_dir / f'{file.stem}.png'
    fig.savefig(target)
    plt.close(fig)  # otherwise every figure stays in memory
Sample Data
This is for testing the plotting code
Create a so_data/files directory manually.
# One small frame with two x/y pairs, written out 1000 times as test input.
df = pd.DataFrame(
    {
        'x1': [5.0, 6.0, 7.0, 8.0, 9.0],
        'y1': [60, 70, 80, 90, 100],
        'x2': [5.5, 6.5, 7.5, 8.5, 9.5],
        'y2': [500, 600, 700, 800, 900],
    }
)
for x in range(1, 1001):
    out_name = f'so_data/files/data_{x}.csv'
    df.to_csv(out_name, index=False)
Alternate Answer
This answer addresses cases where there are many consecutive pairs of x/y columns
df.columns returns the column names, which can be chunked into pairs
For consecutive column pairs, this answer works
list(zip(*[iter(df.columns)]*2)) → [('x1', 'y1'), ('x2', 'y2')]
If necessary, use some other pattern to create pairs of columns
Use .loc, since there will be column names, instead of .iloc for column indices.
p = Path('so_data/files')
files = p.rglob('data_*.csv')

for file in files:
    df = pd.read_csv(file, header=0, sep=',')

    # Chunk consecutive columns into (x, y) pairs; a trailing unpaired column is dropped.
    col_pair = list(zip(*[iter(df.columns)]*2))
    if not col_pair:
        continue  # fewer than two columns: nothing to plot, and subplots(0, 1) would fail

    # squeeze=False always returns a 2-D array of Axes, so ravel() also works
    # when there is only a single column pair (otherwise a bare Axes is returned).
    fig, axes = plt.subplots(len(col_pair), 1, squeeze=False)
    axes = axes.ravel()

    for cols, ax in zip(col_pair, axes):
        # list(cols) makes the selection an explicit list of labels;
        # after selection the x data is at position 0.
        df.loc[:, list(cols)].plot(x=0, ax=ax)

    fig.savefig(p.parent / 'plots' / f'{file.stem}.png')
    plt.close(fig)
Related
Example Input Data:
I am a beginner in Python. I use a for loop to read several csv files that look like the one above (all of the files have the same format).
so far my code was look like below.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
# Question code, kept exactly as posted (the original indentation was lost).
# Show every column and every row when a DataFrame is printed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# NOTE(review): plt.subplots returns (figure, axes); the two names are swapped
# here, so `ax` is bound to the Figure and `fig` to the 4x4 array of Axes.
ax, fig = plt.subplots(4,4,sharex=False, sharey=False, figsize=(22, 10), dpi=70, linewidth=0.5)
ax = np.array(ax)
# Reads every .csv file in `directory`, builds a date column and tries to plot Tavg.
def loop_directory(directory):
for filename in os.listdir(directory):
if filename.endswith(".csv"):
file_directory = os.path.join(directory, filename)
# print(filename)
df = pd.read_csv(file_directory)
# Keep only the rows whose Tavg is not -999 (presumably a missing-value marker — confirm).
df = df[df['Tavg'].isin([-999]) == False]
# Join Year and Month into a "Year/Month" string and parse it as a datetime.
df[['Year','Month']] = df[['Year','Month']].astype(int).astype(str)
df["Year&Month"] = df[['Year', 'Month']].agg("/".join,axis=1)
df["Year&Month"] = pd.to_datetime(df["Year&Month"])
x = df["Year&Month"]
y = df["Tavg"]
# NOTE(review): this zips the date values with the Tavg values, so `axes` is a
# timestamp rather than an Axes object; the subplot array is never used here.
for axes,col in zip(x, y):
axes.plot(df.index, df[col]) # here is the problem, i dont know how to use for loop to plot in subplots
plt.show()
if __name__ == "__main__":
loop_directory(r"C:\Users\LAB312\Desktop\vietnam\anomaly")
I've tried more than ten times, but it didn't work at all.
I want to know how to use this kind of syntax, e.g. ax, zip, etc.
enter image description here
I want to plot into every subplot of one figure.
Every ax should contain a plot.
Firstly, you have your fig and ax reversed in your call to plt.subplots, it should be:
fig, ax = plt.subplots(4,4,sharex=False, sharey=False, figsize=(22, 10), dpi=70, linewidth=0.5)
You can then access each set of axes to call plot by indexing. You can index the 4 by 4 numpy array to get each axes set in your 4 by 4 grid of plots. i.e. ax[0, 0].plot(...), ax[0, 1].plot(...), etc. up to ax[3, 3].plot(...)
Your question needs a bit more information to clarify how you want the data plotted though! I can see you combine the first two columns so that you have 4 columns, but consider how do you want each sample to be plotted.
EDIT: As you want to plot your files sequentially in ax[0, 0], ax[0, 1], etc., you can flatten the 2D numpy array of axes to get a 1D iterable that you can loop through or index with one value. I don't have your files so I can't test it but here's some demo code that should give you an idea of what to do.
As @sam mentioned in the comments, you should separate your csv collection logic and your plotting logic.
def loop_directory(directory):
    """Return the paths of all ``.csv`` entries directly inside *directory*.

    Args:
        directory: Folder to scan (not searched recursively).

    Returns:
        A list of paths, each one *directory* joined with a name ending in
        '.csv'. The list is sorted because ``os.listdir`` returns names in
        arbitrary order, and callers that assign one file per subplot need a
        reproducible order.

    Raises:
        OSError: if *directory* cannot be listed (propagated from ``os.listdir``).
    """
    return sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith('.csv')
    )
def plot_csvs(csvs):
    """Plot Tavg against Year&Month for each CSV, one file per subplot.

    The files are drawn into a 4x4 grid in row-major order: [0, 0], [0, 1], ...
    If more files are given than the grid has subplots, the extra files are
    skipped with a warning instead of failing with an IndexError.

    Args:
        csvs: Sequence of CSV file paths.

    Raises:
        KeyError: if a file has no "Year&Month" or "Tavg" column after processing.
    """
    import warnings

    fig, ax = plt.subplots(4, 4, sharex=False, sharey=False, figsize=(22, 10), dpi=70, linewidth=0.5)
    ax = np.array(ax).flatten()  # Flatten to 1D, [0, 0], [0, 1], etc

    if len(csvs) > len(ax):
        warnings.warn(
            f"{len(csvs)} CSV files given but only {len(ax)} subplots available; "
            "the remaining files are not plotted"
        )

    # zip stops at the shorter of the two, so the axes are never over-indexed.
    for axes, filename in zip(ax, csvs):
        df = pd.read_csv(filename)
        # Do your processing here
        # NOTE(review): "Year&Month" is expected to be created by that
        # processing step — confirm it is not already a column of the raw files.
        x = df["Year&Month"]
        y = df["Tavg"]
        axes.plot(x, y)
    plt.show()
# Driver: collect the CSV paths from one folder, then plot them.
csv_dir = '/path/to/csv/dir'  # placeholder — replace with the real folder
csv_paths = loop_directory(csv_dir)
plot_csvs(csv_paths)
I'm trying to create scatter plot from several txt files. All files have the same structure: two columns with data and 'comma' as a separator:
54.1,12
65.7,11
122.2,18
etc
For small number of files i have this code:
import numpy as np
import matplotlib.pyplot as plt
import csv
# Question code, kept exactly as posted (the original indentation was lost).
# Create data
# Each file holds two comma-separated columns; unpack=True returns them as two arrays.
g1=np.loadtxt('214.txt',delimiter=',', unpack=True)
g2=np.loadtxt('228.txt',delimiter=',', unpack=True)
g3=np.loadtxt('491.txt',delimiter=',', unpack=True)
g4=np.loadtxt('647.txt',delimiter=',', unpack=True)
# Parallel tuples: the i-th data set is drawn with the i-th colour and legend label.
data = (g1, g2, g3,g4)
colors = ("red", "green", "blue", "black")
groups = ("214", "228", "491", "647")
# Create plot
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# NOTE(review): the loop variable reuses the name `data`, so after the loop
# `data` refers to the last data set rather than to the tuple of all four.
for data, color, group in zip(data, colors, groups):
# The first column of the file is used as y, the second as x.
y, x = data
ax.scatter(x, y, alpha=0.8, c=color, edgecolors='none', s=30, label=group)
#Plot settings
plt.title('Matplot scatter plot')
plt.legend(loc=4)
axes = plt.gca()
axes.set_xlim([2,30])
axes.set_ylim([0,3000])
# Flip the y-axis direction.
plt.gca().invert_yaxis()
plt.show()
Please advise how to modify it to read multiple (up to 50 - 100) txt files in folder, if number of them is different every time ?
I would search for all files in your current directory and identify which you want to extract data from. This can be done with something like:
from os import listdir, path
# All regular files (not directories) in the current working directory.
files = list(filter(path.isfile, listdir('.')))
# Of those, keep the ones whose name begins with the identifying prefix.
file_names = [name for name in files if name.startswith('file_name_identifer')]
This will give you a list of file names which contain the data you're wanting to extract, you can then just load them one by one in a for loop. Using similar loading techniques to what you've used above:
# Load every selected file; each entry of `data` holds the unpacked columns of one file.
data = []
for file in file_names:
    # Pass the loop variable, not the literal string 'file'.
    data.append(np.loadtxt(file, delimiter=',', unpack=True))
You could flatten this to a list comprehension too:
# Same as the loop form: pass the loop variable, not the literal string 'file'.
data = [np.loadtxt(file, delimiter=',', unpack=True) for file in file_names]
If your files don't start with something which can be used to identify them, you can simply check some other way instead: change `if file.startswith('file_name_identifer')` to something else, for instance a check that they are .txt files: `if file.endswith('.txt')`.
You can get a list of all files in directory using method described in this post
And then do something like this:
# Load every file in `filenames`; each entry of `data` holds the unpacked columns of one file.
data = []
for file in filenames:
    data.append(np.loadtxt(file, delimiter=',', unpack=True))
# And do everything else you did with data
Though if your dataset is larger than the available system memory, I would consider adding the data points to the plot as you read the files:
# Plot each file as soon as it is read, so only one file is held in memory at a time.
colors = ["red", "green", "blue", "black"]
for i, file in enumerate(filenames):
    data = np.loadtxt(file, delimiter=',', unpack=True)
    group = file.split('.')[0]  # legend label: the file name up to the first dot
    color = colors[i % len(colors)]  # cycle through the palette when there are more files than colours
    # As in the question (`y, x = data`), the first column is y and the second is x.
    ax.scatter(data[1], data[0], alpha=0.8, c=color, edgecolors='none', s=30, label=group)
P.S. The quotes (both double and single ones) may be typed wrong, as I'm writing from a mobile device.
Thanks for help. Here is what worked for me:
import numpy as np
import matplotlib.pyplot as plt
from os import listdir, path
import logging, sys
import random
data = []

# Regular files in the working directory whose names end in ".txt".
files = list(filter(path.isfile, listdir('.')))
file_names = [file for file in files if file.endswith('.txt')]

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# One scatter series per file, each in a random six-digit hex colour and
# labelled with the file name.
hex_digits = '0123456789ABCDEF'
for file in file_names:
    data = np.loadtxt(file, delimiter=",", unpack=True)
    color = ["#" + ''.join(random.choice(hex_digits) for j in range(6))]
    # The second column goes on the x-axis, the first on the y-axis.
    ax.scatter(
        data[1],
        data[0],
        alpha=0.8,
        c=color,
        edgecolors="none",
        s=30,
        label=file,
    )

# Plot settings
plt.title('Matplot scatter plot')
plt.legend(loc=4)
axes = plt.gca()
axes.invert_yaxis()
plt.show()
I want to draw multiple ternary graphs and thought to do this using matplotlib's subplot.
I'm just getting empty 'regular' plots though, not the ternary graphs I want in there. I found the usage of
figure, ax = plt.subplots()
tax = ternary.TernaryAxesSubplot(ax=ax)
so this seems to be possible, but can't really find out how to get this working. Any ideas?
Code I'm using:
I'm using a for loop as the data has columns named tria1-a, tria2-a, etc for the different triads
import ternary
import matplotlib.pyplot as plt
import pandas as pd
# Question code, kept exactly as posted (the original indentation was lost).
#configure file to import.
filename = 'somecsv.csv'
filelocation = 'location'
# NOTE(review): the path is built by plain concatenation, so `filelocation`
# presumably needs a trailing path separator — confirm.
dfTriad = pd.read_csv(filelocation+filename)
# plot the data
scale = 33
# A single regular Axes is created and wrapped once in a ternary axes object.
figure, ax = plt.subplots()
tax = ternary.TernaryAxesSubplot(ax=ax, scale=scale)
figure.set_size_inches(10, 10)
tax.set_title("Scatter Plot", fontsize=20)
tax.boundary(linewidth=2.0)
tax.gridlines(multiple=1, color="blue")
tax.legend()
tax.ticks(axis='lbr', linewidth=1, multiple=5)
tax.clear_matplotlib_ticks()
#extract the xyz columns for the triads from the full dataset
# Columns are named tria1-a/-b/-c ... tria5-a/-b/-c.
for i in range(1,6) :
key_x = 'tria'+ str(i) + '-a'
key_y = 'tria' + str(i) + '-b'
key_z = 'tria' + str(i) + '-c'
#construct dataframe from the extracted xyz columns
dfTriad_data = pd.DataFrame(dfTriad[key_x], columns=['X'])
dfTriad_data['Y'] = dfTriad[key_y]
dfTriad_data['Z'] = dfTriad[key_z]
#create list of tuples from the constructed dataframe
triad_data = [tuple(x) for x in dfTriad_data.to_records(index=False)]
# NOTE(review): plt.subplot selects a new regular subplot, but the scatter is
# still issued on the single `tax` created before the loop; no ternary axes is
# created for the new subplot.
plt.subplot(2, 3, i)
tax.scatter(triad_data, marker='D', color='green', label="")
tax.show()
I had the same problem and could solve it by first "going" into the subplot, then creating the ternary figure in there by giving plt.gca() as keyword argument ax:
scale = 10

# Select the target subplot first and hide its ordinary x/y axes ...
host_ax = plt.subplot(2, 2, 4, frameon=False)
host_ax.get_xaxis().set_visible(False)
host_ax.get_yaxis().set_visible(False)

# ... then build the ternary axes on top of that subplot.
figure, tax = ternary.figure(ax=host_ax, scale=scale)

#now you can use ternary normally:
line_start = scale * np.array((0.5, 0.5, 0.0))
line_end = scale * np.array((0.0, 0.5, 0.5))
tax.line(line_start, line_end)
tax.boundary(linewidth=1.0)
#...
I have multiple graphs to compare to each other, but it is very time-consuming to compare the graphs and then change the code to compare the next features. That's why I decided to use buttons to show and hide the plots I need.
The graphs are computed in a for-loop because the number of plots to show simultaneously is not known in advance. This is done in the fashion shown at the bottom.
The problem is that I can't access the plots inside the for-loop to switch them on. I need three buttons in total, one for every function.
for f in flist:
ax1.plot(f['x'], f['fx'], label=f['fname'] )
ax1.legend()
In the part above I generate the plots and in the following part I have to access the plots and labelnames
lines = [ax1]
But that doesn't work like I thought. Do you have any suggestions how I could solve the problem?
I've got the code from a matplotlib example (See Check Buttons example). Shown below is a code example the way I want to use it and doesn't work.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.widgets import CheckButtons
# Question code, kept exactly as posted (the original indentation was lost).
# Build three parabolas over the same x range; each entry stores x, f(x) and a label.
flist = []
xl = [np.linspace(0,8,265)]
for xi in xl:
flist.append({'x': xi, 'fx': -0.4*(xi-4)**2+3,'fname':'-0.4*(xi-4)**2+3'})
flist.append({'x': xi, 'fx': -0.4 * (xi - 5) ** 2 + 3, 'fname': ' -0.4*(xi-5)**2+3'})
flist.append({'x': xi, 'fx': -0.4 * (xi - 3.5) ** 2 + 3, 'fname': '-0.4*(xi-3.5)**2+3'})
fig = plt.figure(figsize=(12, 6))
gs = GridSpec(2, 2, width_ratios=[1, 2.5])
ax1 = plt.subplot(gs[:, :-1])
# The objects returned by ax1.plot are discarded here.
for f in flist:
ax1.plot(f['x'], f['fx'], label=f['fname'] )
ax1.legend()
ax1.set_xlabel("time")
ax1.set_ylabel("amplitude")
ax1.set_title('graphs')
# NOTE(review): `lines` holds the Axes itself, not the three plotted lines, so
# the labels and visibility flags below are read from the Axes and only one
# check button is created.
lines = [ax1]
# Make checkbuttons with all plotted lines with correct visibility
rax = plt.axes([0.05, 0.4, 0.1, 0.15])
labels = [str(line.get_label()) for line in lines]
visibility = [line.get_visible() for line in lines]
check = CheckButtons(rax, labels, visibility)
# Callback: toggle the visibility of the entry whose label was clicked, then redraw.
def func(label):
index = labels.index(label)
lines[index].set_visible(not lines[index].get_visible())
plt.draw()
check.on_clicked(func)
plt.tight_layout()
plt.show()
lines should be a list of lines you want to toggle visibility for. Hence you would want to fill this list with the lines in the for loop.
# Keep a handle on every plotted line so its visibility can be toggled later.
lines = []
for f in flist:
    # ax.plot returns a list of Line2D objects; the trailing comma unpacks the
    # single line that this call creates.
    line, = ax1.plot(f['x'], f['fx'], label=f['fname'])
    lines.append(line)
I have two folders with similar number of files: maindirNo and maindirWith. I'm trying to plot each pair of similar files from folders on one plot:
# Question code, kept exactly as posted (the original indentation was lost, so
# it is not visible which loop level savefig/close belong to).
# Walk both folders and plot the simulated series of every matching file.
for i in [maindirNo, maindirWith]:
for root, dirs, files in os.walk(i):
for fil in files:
if 'output.rsv' in fil:
# NOTE(review): the path is built from the top folder `i`, not from `root`;
# files found in sub-folders by os.walk would presumably not be located — confirm.
df = pd.read_csv(os.path.join(i, fil), skiprows = 9, delimiter = r'\s+', header = None)
# `mergedlevels` and `df_observed` are defined outside this snippet.
df['SIMULATEDm'] = mergedlevels
df['OBSERVEDm'] = df_observed['OBSERVEDm']
# Monthly (month-start) dates from 1/1991 to 12/2040.
df['date'] = pd.date_range('1/1991','12/2040', freq='MS')
# Blue for the first folder, red for the second.
if i == maindirNo:
plt.plot(df['date'], df['SIMULATEDm'], 'b', label='No outlet')
if i == maindirWith:
plt.plot(df['date'], df['SIMULATEDm'], 'r', label='With outlet')
plt.legend(loc = 'lower right')
plt.savefig('C:/Users/sgulbin/Desktop/AGU_Conf/plots/%s.jpg' %fil)
plt.close()
The problem is that I either have all datesets plotted on one plot, or one plot for each file (I need two datasets on one plot). I assume I can append output to an empty dataframe and then plot it, but is there a simplest way to plot them through the loop?
P.S. I know there are kind of similar questions to this, but not exactly.
pandas uses matplotlib which gives fig and ax when you create many plots. ie. 5 plots in one column
fig, ax = plt.subplots(5, 1)
and then you can use ax[0], ax[1], etc. to choose the plot on which each line is drawn.
import matplotlib.pyplot as plt
import pandas as pd
import random
SIZE = 5

# One column of SIZE stacked subplots.
fig, ax = plt.subplots(SIZE, 1)

# Two passes over the same axes: the first pass stands for the first folder
# (blue), the second pass for the second folder (red). Each subplot therefore
# ends up with one blue and one red line.
for line_style in ('b', 'r'):
    for idx, axis in enumerate(ax):
        # Ten random integers serve as example data for one file.
        df = pd.DataFrame([random.randint(0, 10) for _ in range(10)])
        axis.plot(df, line_style)

plt.show()