Loop through many files in 2 directories - python

Given: Folder 1 with A.txt and B.txt, and Folder 2 with A.txt and B.txt
How would I be able to run them concurrently such as file A.txt from folder 1 should run with file from folder 2 A.txt and so on.
What I have so far loops through all of the second folders files and then loops through the first folders files, which throws it out of order. Some stuff will be done such as merging parts of the files together (which has been done so far).
My main question is how would I be able to run through 2 directories simultaneously and do stuff inside them.
Note there are many files in Folder 1 and Folder 2 so I need to find a way that utilizes directory schema of some sort
import os, glob

# paths must be string literals; bare /folder1 was a syntax error
patha = '/folder1'
pathb = '/folder2'

# NOTE: nesting the loops pairs every file in patha with every file in
# pathb (cartesian product), not A.txt with A.txt — see zip() answers below
for filename in glob.glob(os.path.join(patha, '*.txt')):
    for filenamez in glob.glob(os.path.join(pathb, '*.txt')):
        # MY FUNCTION THAT DOES OTHER STUFF
        pass

You can open files with the same name in both folders simultaneously using context managers and do whatever needs to be done from both input streams:
import os

my_folders = ['Folder1', 'Folder2']
# files present in both folders (set intersection)
common_files = set(os.listdir(my_folders[0])) & set(os.listdir(my_folders[1]))
# files present in exactly one folder (symmetric difference)
non_common_files = set(os.listdir(my_folders[0])) ^ set(os.listdir(my_folders[1]))
# fixed typo: stray '"' where ':' was intended
print(f'common_files: {common_files}')
print(f'files without matches: {non_common_files}')

for f_name in common_files:
    # open the same-named file from both folders at once
    with open(os.path.join(my_folders[0], f_name)) as src_1, \
         open(os.path.join(my_folders[1], f_name)) as src_2:
        # do the stuff on both sources... for instance print first line of each:
        print(f'first line of src_1: {src_1.readline()}')
        print(f'first line of src_2: {src_2.readline()}')
Output
common_files" {'A.txt'}
files without matches: set()
first line of src_1: some txt
first line of src_2: text in folder 2's A

Is zip what you're looking for?
import glob
import os

# NOTE: zip pairs files positionally, so A.txt only lines up with A.txt
# if both listings come back in the same order (glob order is arbitrary;
# see the sorted() variant further down)
files_a = glob.glob(os.path.join(path_a, "*.txt"))
files_b = glob.glob(os.path.join(path_b, "*.txt"))
for file_a, file_b in zip(files_a, files_b):
    pass

You could maybe do something like this:
from threading import Thread
import os, glob

def dir_iterate(path: str):
    """Visit every .txt file under *path* (placeholder for the real work)."""
    for filename in glob.glob(os.path.join(path, '*.txt')):
        # Other stuff ..
        pass

path1 = "./directory1"
path2 = "./directory2"
# one thread per directory; note this walks the two folders independently,
# it does not pair same-named files together
Thread(target=dir_iterate, args=(path1,)).start()
Thread(target=dir_iterate, args=(path2,)).start()

This should work,
import glob
import os

# sorting both listings makes the zip pairing deterministic, so A.txt in
# path_a lines up with A.txt in path_b
files_a = sorted(glob.glob(os.path.join(path_a, "*.txt")))
files_b = sorted(glob.glob(os.path.join(path_b, "*.txt")))
for file_a, file_b in zip(files_a, files_b):
    # Add code to concat
    pass

Related

Zipping a list of folders with Python

Hi everyone, I'm really new to Python, so I need some help here.
I have a list of folders names inside a .CSV file. All these folders are inside the same path.
I need to zip them individually (each one needs to become a .ZIP file, maintaining its own original name) and, after zipping, delete the original folders.
Tried some things here, but had no success :(
I read about zipfiles, os.walk, import csv, but I can't get these things together.
Can someone help me with this one?
The code is here. I'm really sorry, it probably makes no sense as it is. I'm really a beginner :(
import os
import zipfile
import csv
import sys

os.chdir('dir')
# list the actual folder contents; the original assigned the string 'dir',
# which made the loop iterate over its three characters
files = os.listdir('.')
for i in range(len(files)):
    # files[i] (indexing) — files(i) was a call and raised TypeError
    with zipfile.ZipFile(files[i] + '.zip', 'w') as zipMe:
        zipMe.write(files[i], compress_type=zipfile.ZIP_DEFLATED)
The read_csv() function is called to open the CSV file and read the directory names from the CSV into a list named dir_names.
The read_csv() function calls the zip_dirs() function and passes the dir_names list containing the directory names. The zip_dirs() function zips each directory into a zip file and saves it.
The zip_dirs() function calls the delete_dirs() function and it deletes the original directories.
import os
import csv
import zipfile
import shutil

dir_names = []
zip_out = 'dirs.zip'
dir_csv = 'dir_names.csv'
parent_dir = 'dir-to-zip/'

# delete directories
def delete_dirs(dir_names):
    """Remove each original directory tree after it has been zipped."""
    for name in dir_names:
        # shutil.rmtree handles non-empty directories; os.rmdir only
        # works on empty ones and would raise here
        shutil.rmtree(parent_dir + name)

# zip directories
def zip_dirs(dir_names):
    """Zip each named directory (including its files) and delete the originals."""
    for name in dir_names:
        with zipfile.ZipFile(name + '.zip', "w", zipfile.ZIP_DEFLATED) as zip_process:
            # ZipFile.write on a bare directory stores only an empty entry;
            # walk the tree so the files inside are archived too
            for root, dirs, files in os.walk(parent_dir + name):
                for f in files:
                    zip_process.write(os.path.join(root, f))
    delete_dirs(dir_names)

# read the directory names from csv file
def read_csv():
    """Read directory names (first column) from the csv, then zip them all."""
    with open(dir_csv, 'r') as f:
        csv_reader = csv.reader(f, delimiter=',')
        for row in csv_reader:
            dir_names.append(row[0])
    zip_dirs(dir_names)

read_csv()
CSV Input:
| ------|
| dir1 |
| dir2 |
| dir3 |
| dir4 |
Directory Structure:
./dir-to-zip
/dir1
/dir2
/dir3
/dir4
Using the shutil library in python
# importing libraries
import pandas as pd
import os
import shutil

### *****below directories are simply examples; please set your own directories*****
# directory that holds the original folders; the zip files are created here too
base_dir = 'D:\\Zip test'
os.chdir(base_dir)

# folder names to zip come from the 'Folder Names' column of the csv
names_df = pd.read_csv(r'C:\Users\sahaa\Downloads\folder_names.csv')
for name in names_df['Folder Names'].tolist():
    # archive each folder under its own name, then drop the original
    shutil.make_archive(name, 'zip', base_dir + '\\' + name)
    shutil.rmtree(base_dir + '\\' + name)
Try this
└───main_dir
├───dir01
├───dir02
└───dir03
....
import shutil
import os
def compress_directory(source_dir, output_filename, kind='zip'):
    """Archive *source_dir* as *output_filename*.<kind> (default: .zip)."""
    shutil.make_archive(output_filename, kind, source_dir)
main_dir = r'S:\repo\tools\main_dir'
# list sub folders (absolute paths of every entry directly inside main_dir)
all_dirs = [os.path.join(*[main_dir, sub_dir]) for sub_dir in os.listdir(main_dir)]
# zip sub dirs
for sub_dir in all_dirs:
    compress_directory(sub_dir, sub_dir)
    # remove sub directory once its .zip has been written next to it
    shutil.rmtree(sub_dir)
# zip main dir (it now contains only the sub-dir .zip files)
compress_directory(main_dir, main_dir)
# remove main directory, leaving main_dir.zip behind
shutil.rmtree(main_dir)
...
└───main_dir.zip
├───dir01.zip
├───dir02.zip
└───dir03.zip

Python: Loop to open multiple folders and files in python

I am new to python and currently work on data analysis.
I am trying to open multiple folders in a loop and read all files in folders.
Ex. working directory contains 10 folders needed to open and each folder contains 10 files.
My code for open each folder with .txt file;
file_open = glob.glob("home/....../folder1/*.txt")
I want to open folder 1 and read all files, then go to folder 2 and read all files... until folder 10 and read all files.
Can anyone help me how to write loop to open folder, included library needed to be used?
I have my background in R, for example, in R I could write loop to open folders and files use code below.
folder_open <- dir("......./main/")
for (n in 1:length(folder_open)) {
  file_open <- dir(paste0("......./main/", folder_open[n]))
  for (k in 1:length(file_open)) {
    # use a fresh variable: assigning back to file_open would clobber
    # the vector being iterated
    file_content <- readLines(paste0("...../main/", folder_open[n], "/", file_open[k]))
    # Finally I can read all folders and files.
  }
}
This recursive method will scan all directories within a given directory and then print the names of the txt files. I kindly invite you to take it forward.
import os
def scan_folder(parent):
    """Recursively print the name of every .txt file found under *parent*."""
    for entry in os.listdir(parent):
        if entry.endswith(".txt"):
            # txt file: report it (or do whatever you want here)
            print(entry)
        else:
            child = "".join((parent, "/", entry))
            # descend into sub-directories only
            if os.path.isdir(child):
                scan_folder(child)
scan_folder("/example/path") # Insert parent direcotry's path
Given the following folder/file tree:
C:.
├───folder1
│ file1.txt
│ file2.txt
│ file3.csv
│
└───folder2
file4.txt
file5.txt
file6.csv
The following code will recursively locate all .txt files in the tree:
import os
import fnmatch

# walk the whole tree; fnmatch.filter picks the .txt names out of each
# directory's file list in one call
for path, dirs, files in os.walk('.'):
    for file in fnmatch.filter(files, '*.txt'):
        fullname = os.path.join(path, file)
        print(fullname)
Output:
.\folder1\file1.txt
.\folder1\file2.txt
.\folder2\file4.txt
.\folder2\file5.txt
Your glob() pattern is almost correct. Try one of these:
file_open = glob.glob("home/....../*/*.txt")
file_open = glob.glob("home/....../folder*/*.txt")
The first one will examine all of the text files in any first-level subdirectory of home/......, whatever that is. The second will limit itself to subdirectories named like "folder1", "folder2", etc.
I don't speak R, but this might translate your code:
# '*' matches any first-level sub-directory of main/
for filename in glob.glob("......../main/*/*.txt"):
    with open(filename) as file_handle:
        for line in file_handle:
            # perform data on each line of text
            pass
I think nice way to do that would be to use os.walk. That will generate tree and you can then iterate through that tree.
import os

# each item os.walk yields is a (dirpath, dirnames, filenames) triple
directory = './'
for entry in os.walk(directory):
    print(entry)
This code will look for all directories inside a directory, printing out the names of all files found there:
#--------*---------*---------*---------*---------*---------*---------*---------*
# Desc: print filenames one level down from starting folder
#--------*---------*---------*---------*---------*---------*---------*---------*
import os, fnmatch, sys
def find_dirs(directory, pattern):
    """Yield the path of each sub-directory of *directory* whose name matches *pattern*."""
    for entry in os.listdir(directory):
        full = os.path.join(directory, entry)
        if os.path.isdir(full) and fnmatch.fnmatch(entry, pattern):
            yield full
def find_files(directory, pattern):
    """Yield the path of each regular file in *directory* whose name matches *pattern*."""
    for entry in os.listdir(directory):
        full = os.path.join(directory, entry)
        if os.path.isfile(full) and fnmatch.fnmatch(entry, pattern):
            yield full
#--------*---------*---------*---------*---------*---------*---------*---------#
#                            M A I N L I N E                                   #
#--------*---------*---------*---------*---------*---------*---------*---------#
# The original wrapped this in `while True:` only to sys.exit() at the end —
# straight-line code says the same thing honestly.
# Set directory (fixed the stray third backslash before Desktop)
os.chdir("C:\\Users\\Mike\\Desktop")
for filedir in find_dirs('.', '*'):
    print('Got directory:', filedir)
    for filename in find_files(filedir, '*'):
        print(filename)
sys.exit()  # END PROGRAM
pathlib is a good choose
from pathlib import Path

# or use: glob('**/*.txt')
# iterate rglob lazily — materializing it into a throwaway list first
# bought nothing
for txt_path in Path('demo/test_dir').rglob('*.txt'):
    if txt_path.is_file():
        print(txt_path.absolute())

How to write lists in files with python

My data is organized as such:
I have 30 folders. In each of them, 3 subfolders. In each of them, one file.
I would like to write a script that writes, in a text file 1 located in folder 1, the paths to the files located in the subfolders of this folder 1; and so on for every other folder.
The problem is that the script only writes, in each text file, the 3rd file (file in subfolder 3) rather than the files in subfolders 1, 2, 3.
This is what I tried:
import glob
import os

gotofolders = '/path/to/folderslocation/'
foldersname = open('/path/to/foldersname.txt').read().split()
for folders in foldersname:
    # join the individual folder name (was: foldersname — the whole list)
    foldersdirectory = os.path.join(gotofolders, folders)
    filepaths = glob.glob(os.path.join(foldersdirectory) + '*subfolders/*files')
    savethepaths = os.path.join(foldersdirectory) + 'files_path_in_that_folder.txt'
    # open the output file ONCE per folder; re-opening with 'w' inside the
    # inner loop truncated it each time, which is why only the last path
    # ever survived
    with open(savethepaths, 'w') as f:
        for filepath in filepaths:
            f.write(filepath + '\n')
As said, it almost works, excepts that in each 'files_path_in_that_folder.txt' I have the 3rd element of the "filepath" list, rather than all 3 elements.
Thanks!
Okay, I figured it out; I had to add:
# open once and write *all* collected paths in a single call, instead of
# re-opening with 'w' (which truncates) for every path
with open (savethepaths,'w') as f:
    f.writelines(list("%s\n" %filepath for filepath in filepaths))
import os
def directory_into_file(_path, file_obj, depth):
    """Write *_path* to *file_obj* prefixed by *depth* asterisks, recursing into directories.

    depth starts as the empty string and gains one '*' per nesting level.
    NOTE(review): relies on os.chdir, so recursion must stay balanced —
    each chdir(_path) is undone by the chdir("..") after the loop.
    """
    file_obj.write(depth + _path + '\n')
    if not os.path.isdir(_path):
        return
    entries = os.listdir(_path)
    os.chdir(_path)
    for entry in entries:
        directory_into_file(entry, file_obj, depth + '*')
    os.chdir("..")
this should work.
_path - the path of the directory,
file_obj - send the object file to the function and first,
depth - at first call send an empty string
hope this would work. didn't try it myself...

Reading in multiple files in directory using python

I'm trying to open each file from a directory and print the contents, so I have a code as such:
import os, sys

def printFiles(dir):
    """Print the contents of every file directly inside *dir*."""
    # join dir onto each bare name instead of chdir(dir)+listdir(dir):
    # after the chdir, listdir(dir) resolved *relative to the new cwd*,
    # which is the path bug the answers below describe
    for f in os.listdir(dir):
        with open(os.path.join(dir, f), 'r') as myFile:
            print(myFile.read())

printFiles(sys.argv[1])
The program runs, but the problem here is that it is only printing one of the contents of the file, probably the last file that it has read. Does this have something to do with the open() function?
Edit: added last line that takes in sys.argv. That's the whole code, and it still only prints the last file.
There is problem with directory and file paths.
Option 1 - chdir:
def printFiles(dir):
    # change the working directory first, then list the *current* dir —
    # the bare names from listdir('.') now resolve correctly for open()
    os.chdir(dir)
    for f in os.listdir('.'):
        myFile = open(f,'r')
        # ...
Option 2 - computing full path:
def printFiles(dir):
    # no chdir here
    for f in os.listdir(dir):
        # join the directory onto each bare filename so open() gets a full path
        myFile = open(os.path.join(dir, f), 'r')
        # ...
But you are combining both options - that's wrong.
This is why I prefer pathlib.Path - it's much simpler:
from pathlib import Path

def printFiles(dir):
    # Path objects carry their directory with them, so there is no
    # chdir/join bookkeeping to get wrong
    dir = Path(dir)
    for f in dir.iterdir():
        myFile = f.open()
        # ...
The code itself certainly should print the contents of every file.
However, if you supply a local path and not a global path it will not work.
For example, imagine you have the following folder structure:
./a
./a/x.txt
./a/y.txt
./a/a
./a/a/x.txt
If you now run
printFiles('a')
you will only get the contents of x.txt, because os.listdir will be executed from within a, and will list the contents of the internal a/a folder, which only has x.txt.

Python: Looping through files in a different directory and scanning data

I am having a hard time looping through files in a directory that is different from the directory where the script was written. I also ideally would want my script through go to through all files that start with sasa. There are a couple of files in the folder such as sasa.1, sasa.2 etc... as well as other files such as doc1.pdf, doc2.pdf
I use Python Version 2.7 with windows Powershell
Locations of Everything
1) Python Script Location ex: C:Users\user\python_project
2) Main_Directory ex: C:Users\user\Desktop\Data
3) Current_Working_Directory ex: C:Users\user\python_project
Main directory contains 100 folders (folder A, B, C, D etc..)
Each of these folders contains many files including the sasa files of interest.
Attempts at running script
For 1 file the following works:
Script is run the following way: python script1.py
# raw string (and closing quote restored): otherwise \u in the path is
# treated as a unicode escape and the literal is a syntax error
file_path = r'C:Users\user\Desktop\Data\A\sasa.1'

def writing_function(file_path):
    """Print every line of the file at *file_path*."""
    with open(file_path) as file_object:
        lines = file_object.readlines()
        for line in lines:
            # print the single line — print(lines) dumped the whole list
            # once per iteration
            print(line)

writing_function(file_path)
However, the following does not work
Script is run the following way: python script1.py A sasa.1
import os
import sys
from os.path import join

dr = sys.argv[1]
file_name = sys.argv[2]
# raw string, and the backslash after the drive letter restored:
# 'C:Users\...' is exactly the malformed path the IOError reports
file_path = r'C:\Users\user\Desktop\Data'
new_file_path = os.path.join(file_path, dr)
new_file_path2 = os.path.join(new_file_path, file_name)

def writing_function(paths):
    """Print every line of the file at *paths*."""
    with open(paths) as file_object:
        for line in file_object:
            print(line)

writing_function(new_file_path2)
I get the following error:
with open(paths) as file_object:
IO Error: [Errno 2] No such file or directory:
'C:Users\\user\\Desktop\\A\\sasa.1'
Please note right now I am just working on one file, I want to be able to loop through all of the sasa files in the folder.
It can be something in the line of:
import os
from os.path import join

def function_exec(file):
    """Placeholder: code to execute on each file goes here."""
    pass

# os.walk visits every directory under the root, so each file in the whole
# tree is passed to function_exec with its full path
for root, dirs, files in os.walk('path/to/your/files'):  # from your argv[1]
    for f in files:
        filename = join(root, f)
        function_exec(filename)
Avoid using the variable name dir: it shadows the python builtin function dir(). Try print(dir(os))
dir_ = argv[1] # is preferable
No one mentioned glob so far, so:
https://docs.python.org/3/library/glob.html
I think you can solve your problem using its ** magic:
If recursive is true, the pattern “**” will match any files and zero
or more directories and subdirectories. If the pattern is followed by
an os.sep, only directories and subdirectories match.
Also note you can change directory location using
os.chdir(path)

Categories