I am creating a text parser with Python 3.6. I have a file layout like below:
(The real file structure I will be using is much more extensive than this.)
-Directory (main folder)
    -amerigroup.txt
    -bcbs.txt
    childfolder
        -medicare.txt
I need to extract text into 2 different lists (going through and appending to my ever-growing lists). Whenever I run my current code, I can't seem to get my program to open up my medicare.txt file to read and extract the information. I get an error stating that there is no such file or directory: 'medicare.txt'.
My goal is to get the data from the 3 files and extract it in one go. How do I get the amerigroup and bcbs data then go into the childfolder and get medicare.txt, then repeat that for all branches of my file path?
I am simply trying to open and close my text files in this code snippet. Here's what I have so far:
import re
import os
import pandas as pd
#change active directory
os.chdir(r'\\company\Files\HomeDrive\user\My Documents\claimstest')
#rootdir = r'\\company\Files\HomeDrive\user\My Documents\claimstest'
#set up Regular Expression objects to parse X12
claimidRegex = re.compile(r'(CLM\*)(\d+)')
dxRegex = re.compile(r'(ABK:)(\w\d+)(\*|~)(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?')
claimids = []
dxinfo = []
for dirpath, dirnames, files in os.walk(topdir):
    for name in files:
        cid = []
        dx = []
        if name.lower().endswith(exten):
            data = open(name, 'r')
            data.close()
Thank you so much for taking your time to assist me on this!
Edit: I have tried using os.walk to no avail so far. My most recent attempt (I tried using txtfile_full_path as well; that did not work either):
for dirpath, dirnames, filename in os.walk(base_dir):
    for filename in filename:
        #defining file type
        txtfile = open(filename, "r")
        txtfile_full_path = os.path.join(dirpath, filename)
        print(filename)
Edit 2: for anyone interested, this was my final solution to the problem:
import re
import os
import pandas as pd
#change active directory
os.chdir(r'\\company\Files\HomeDrive\user\My Documents\claimstest')
base_dir = (r'\\company\Files\HomeDrive\user\My Documents\claimstest')
#set up Regular Expression objects to parse X12
claimidRegex = re.compile(r'(CLM\*)(\d+)')
dxRegex = re.compile(r'(ABK:)(\w\d+)(\*|~)(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?(ABF:)?(\w\d+)?(\*|~)?')
claimids = []
dxinfo = []
for dirpath, dirnames, filename in os.walk(base_dir):
    for filename in filename:
        txtfile_full_path = os.path.join(dirpath, filename)
        x12 = open(txtfile_full_path, 'r')
        for i in x12:
            match = claimidRegex.findall(i)
            for word in match:
                claimids.append(word[1])
        x12.seek(0)
        for i in x12:
            match = dxRegex.findall(i)
            for word in match:
                dxinfo.append(word)
        x12.close()
datadic = dict(zip(claimids, dxinfo))
You need to pass the full path to open. Just creating a string variable somewhere won't do anything for you! So the following should avoid your error:
txt_list = []
for dirpath, dirnames, filename in os.walk(base_dir):
    for filename in filename:
        # create full path
        txtfile_full_path = os.path.join(dirpath, filename)
        with open(txtfile_full_path) as f:
            txt_list.append(f.read())
It should be easy enough to integrate the segregation based on your regexes now...
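For example, a minimal sketch of that integration (reusing base_dir, claimidRegex and dxRegex from the question; treat it as an illustration rather than a drop-in script) could look like this:
claimids = []
dxinfo = []
for dirpath, dirnames, filenames in os.walk(base_dir):
    for filename in filenames:
        # build the full path so open() works from any working directory
        full_path = os.path.join(dirpath, filename)
        with open(full_path, 'r') as x12:
            for line in x12:
                for match in claimidRegex.findall(line):
                    claimids.append(match[1])
                for match in dxRegex.findall(line):
                    dxinfo.append(match)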
I want to get all files in a directory (I reached it after doing several for loops, hence fourth.path) that end with .npy or .csv, and then zip those files.
My code runs but puts only one file in the zip file. What am I doing wrong?
I tried to change my indents, but no zip file is being created.
import json
import os
import zipfile
import zlib
directory = os.path.join(os.getcwd(), 'recs')
radarfolder = 'RadarIfxAvian'
file = os.listdir(directory)
def r(p, name):
    p = os.path.join(p, name)
    return p.replace("/", "\\")

#This code will list all json files in each file
for first in os.scandir(directory):
    if first.is_dir():
        for second in os.scandir(first.path):
            if second.is_dir():
                for third in os.scandir(second.path):
                    if third.is_dir():
                        radar_folder_name = ''
                        list_files = ()
                        for fourth in os.scandir(third.path):
                            if fourth.is_dir():
                                if radarfolder in fourth.path:
                                    radar_folder_name = fourth.path
                                    print(radar_folder_name)
                                    list_files = ()
                                    for file in os.listdir(fourth.path):
                                        if file.endswith(".npy") | file.endswith(".csv"):
                                            list_files = (file)
                                            print(list_files)
                                            with zipfile.ZipFile(radar_folder_name + '\\' + 'radar.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zipMe:
                                                zipMe.write(radar_folder_name + '\\' + list_files)
                                                zipMe.close()
I tried to change my indents, which either resulted in the error TypeError: can only concatenate str (not "tuple") to str or in no zip file being created.
As I said in my second comment, your problem comes from the 'w' argument in your zipping statement. It causes the zip to be overwritten every time it's opened, which you do for each file you zip in. You can fix this 2 ways (at least):
Replace 'w' with 'a'; this way the files will be appended to your zip (with the side effect that, if you do this several times, files will be added more than once). A short sketch of this option follows the code below.
Keep the 'w', but only open the zip once, having listed all the files you want to zip before. See my code below.
I've taken the liberty to rewrite the part of your code where you look for the 'RadarIfxAvian' folder, since deeply nested for loops are clumsy (and if your folder structure changes, they might not work), replacing it with a multi-purpose recursive function.
Note that the folder structure will be included in the .zip; if you want to zip only the files themselves, consider doing os.chdir(radar_folder_name) before zipping the files.
import os, zipfile

# This function recursively looks for the 'filename' file or folder
# under 'start_path' and returns the full path, or an empty string if not found.
def find_file(start_path, filename):
    if filename in os.listdir(start_path):
        return start_path + '/' + filename
    for file in os.scandir(start_path):
        if not file.is_dir():
            continue
        if (deep_path := find_file(start_path + '/' + file.name, filename)):
            return deep_path
    return ''

directory = os.path.join(os.getcwd(), 'recs')
radarfolder = 'RadarIfxAvian'
radar_folder_name = find_file(directory, radarfolder)
print(radar_folder_name)

list_files = []
for file in os.listdir(radar_folder_name):
    if file.endswith(".npy") or file.endswith(".csv"):
        list_files.append(file)

with zipfile.ZipFile(radar_folder_name + '/' + 'radar.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zipMe:
    for file in list_files:
        zipMe.write(radar_folder_name + '/' + file)
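For completeness, a minimal sketch of the first option (keeping your own loops, but opening the archive in append mode) might look roughly like this, assuming radar_folder_name has already been found:
for file in os.listdir(radar_folder_name):
    if file.endswith(".npy") or file.endswith(".csv"):
        # 'a' adds to an existing archive (or creates it) instead of overwriting it
        with zipfile.ZipFile(radar_folder_name + '/' + 'radar.zip', 'a', compression=zipfile.ZIP_DEFLATED) as zipMe:
            zipMe.write(radar_folder_name + '/' + file)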
If I understand your code correctly, you are looking for a folder "RadarIfxAvian" and want to place a .ZIP in that folder containing any .CSV or .NPY files in that directory. This should do the equivalent, using os.walk for the recursive search:
import os
import zipfile

for path, dirs, files in os.walk('recs'):
    if os.path.basename(path) == 'RadarIfxAvian':
        print(path)
        with zipfile.ZipFile(os.path.join(path, 'radar.zip'), 'w', zipfile.ZIP_DEFLATED) as zip:
            for file in files:
                if file.endswith(".npy") or file.endswith(".csv"):
                    print(file)
                    # join with the directory path so this works from any working directory;
                    # the second argument keeps only the bare file name inside the archive
                    zip.write(os.path.join(path, file), file)
        break  # stop search once the directory is found and processed
I adjusted my code with the following steps:
Put the if in a function
Wrote to the zip by looping over each item in the list I appended
import json
import os
import glob
import zipfile
import zlib
directory = os.path.join(os.getcwd(), 'recs')
radarfolder = 'RadarIfxAvian'
file = os.listdir(directory)
list_files = []
def r(p, name):
    p = os.path.join(p, name)
    return p.replace("/", "\\")

def tozip(path, file):
    filestozip = []
    if file.endswith(".npy") or file.endswith(".csv"):
        filestozip = (path + '\\' + file)
        list_files.append(filestozip)
    return list_files

#This code will list all json files in each file
for first in os.scandir(directory):
    if first.is_dir():
        for second in os.scandir(first.path):
            if second.is_dir():
                for third in os.scandir(second.path):
                    if third.is_dir():
                        radar_folder_name = ''
                        filestozip = []
                        list_files.clear()
                        for fourth in os.scandir(third.path):
                            if fourth.is_dir():
                                if radarfolder in fourth.path:
                                    radar_folder_name = fourth.path
                                    for file in os.listdir(fourth.path):
                                        filestozip = tozip(radar_folder_name, file)
                                    print(filestozip)
                                    ZipFile = zipfile.ZipFile(r(radar_folder_name, "radar.zip"), "w")
                                    for a in filestozip:
                                        ZipFile.write(a, compress_type=zipfile.ZIP_DEFLATED)
                                    print(radar_folder_name + " added to zip")
My goal is to take the contents of all text files in subfolders created today and move them to a single existing report.txt but I can't seem to find a good way to go about it. I'm not very experienced in coding so any help would be much appreciated. Here is what I have so far (I know it's rubbish):
if getmtime == today:
    with open(glob.iglob(drive + "://CADIQ//CADIQ_JOBS//?????????????????????")) as f:
        for line in f:
            content += line
    with open(reportFile, "a") as f:
        f.write(content)
Try this, based on How do I list all files of a directory?
import os, time

def last_mod_today(path):
    '''
    return True if getmtime and time have year, mon, day coinciding in their localtime struct, False otherwise
    '''
    t_s = time.localtime(os.path.getmtime(path))
    today = time.localtime(time.time())
    return t_s.tm_mday == today.tm_mday and t_s.tm_year == today.tm_year and t_s.tm_mon == today.tm_mon

name_to_path = lambda d, x: os.path.normpath(os.path.join(os.path.join(os.getcwd(), d), x))

def log_files(d):
    '''
    walking through the files in d,
    log the content of f when the last modification time of f is today
    WARNING : what happens when the file is a JPEG ?
    '''
    scand_dir = os.path.join(os.getcwd(), d)
    print(f"scanning {scand_dir}...")
    (_, _, filenames) = next(os.walk(scand_dir))
    log = open("log.txt", 'a')
    for f in filenames:
        if last_mod_today(name_to_path(d, f)):
            with open(name_to_path(d, f), 'r') as todays_file:
                log.write('##############################\n')
                log.write(f"file : {name_to_path(d, f)}\n")
                log.write(todays_file.read())
                log.write('\n')
                log.write('##############################\n')
    log.close()

#first scanning files in the current directory
(_, dirnames, _) = next(os.walk('./'))
log_files('./')

#then crawling through the subdirs (one level)
for d in dirnames:
    log_files(d)
I would start by creating a desired_date object, which is a datetime.date. You can then format that date into a string, which makes up the pattern you want to look for in your glob. The glob pattern doesn't care about the time, just the date.
from pathlib import Path
import datetime
desired_date = datetime.date(year=2020, month=12, day=22)
pattern = "13.2.1_" + desired_date.strftime("%y_%m_%d") + "_*"
for path in Path("path/to/folders").glob(pattern):
    if not path.is_dir():
        continue
    print(path)
From there, you can visit each path, glob all text files in the current path, and accumulate the lines in each text file. Finally, write everything to one file.
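A rough sketch of that second step (my illustration, assuming the pattern from the snippet above and an existing report.txt in the current directory) might be:
report = Path("report.txt")
with report.open("a") as out:
    for folder in Path("path/to/folders").glob(pattern):
        if not folder.is_dir():
            continue
        # collect every text file in the matching folder and append its contents
        for text_file in folder.glob("*.txt"):
            out.write(text_file.read_text())
            out.write("\n")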
import glob

contents = b''
for file in glob.glob('./*/*.txt'):  # you can change this pattern to match your directory layout
    # open the matched path directly, so files in subfolders are found
    with open(file, 'rb') as f1:
        contents += f1.read()

with open('report.txt', 'wb') as rep:
    rep.write(contents)
Hope this helps :)
It is better to read and write the files as bytes, because otherwise there is a chance of corrupting the data.
I have some xml files in a folder, for example 'assests/2020/2010.xml', 'assests/2020/20005.xml', 'assests/2020/20999.xml', etc. I want to get the filename with the max value in the '2020' folder. For the above three files, the output should be 20999.xml.
I am trying the following:
import glob
import os
list_of_files = glob.glob('assets/2020/*')
# latest_file = max(list_of_files, key=os.path.getctime)
# print (latest_file)
I couldn't find the logic to get the required file.
Here is the resource that has the best answer to my query, but I couldn't build my logic from it.
You can use pathlib to glob for the xml files and access the Path object attributes like .name and .stem:
from pathlib import Path
list_of_files = Path('assets/2020/').glob('*.xml')
print(max((Path(fn).name for fn in list_of_files), key=lambda fn: int(Path(fn).stem)))
Output:
20999.xml
I can't test it out right now, but you may try this:
files = []
for filename in list_of_files:
    filename = str(filename)
    filename = filename.replace('.xml', '')  # assuming it's not printing your complete directory path
    filename = int(filename)
    files += [filename]
print(files)
This should get you your filenames in integer format and now you should be able to sort them in descending order and get the first item of the sorted list.
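Continuing that idea, a small sketch (assuming the integer files list built above) might be:
files.sort(reverse=True)        # sort the integer codes in descending order
largest = files[0]              # the first item is now the maximum
print(str(largest) + ".xml")    # rebuild the filename, e.g. 20999.xml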
Use re to search for the appropriate endings in your file paths. If found, use re again to extract the number.
import re
list_of_files = [
    'assests/2020/2010.xml',
    'assests/2020/20005.xml',
    'assests/2020/20999.xml'
]

highest_nr = -1
highest_nr_file = ''
for f in list_of_files:
    re_result = re.findall(r'\d+\.xml$', f)
    if re_result:
        nr = int(re.findall(r'\d+', re_result[0])[0])
        if nr > highest_nr:
            highest_nr = nr
            highest_nr_file = f
print(highest_nr_file)
Result
assests/2020/20999.xml
You can also try this way.
import os, re
path = "assests/2020/"
files = [
    "assests/2020/2010.xml",
    "assests/2020/20005.xml",
    "assests/2020/20999.xml"
]
n = [int(re.findall(r'\d+\.xml$',file)[0].split('.')[0]) for file in files]
output = str(max(n))+".xml"
print("Biggest max file name of .xml file is ",os.path.join(path,output))
Output:
Biggest max file name of .xml file is assests/2020/20999.xml
import glob

xmlFiles = []
# this will store all the xml file names (without the extension) in your directory
for file in glob.glob("*.xml"):
    xmlFiles.append(file[:-4])  # strip the ".xml" extension
# this will print the maximum one, comparing the names numerically
print(max(xmlFiles, key=int) + ".xml")
I'm trying to implement a Python script that takes a folder from the user (it can be zipped or unzipped) and searches through all the files in the folder to output the specific lines that my regular expression matches. My code below works for regular unzipped folders, but I can't figure out how to do the same with zipped folders that are passed to the function. Below is my code, thanks in advance!
import os, re

def myFunction(folder_name):
    path = folder_name
    for (path, subdirs, files) in os.walk(path):
        # Specify here the format of files you hope to search from (ex: ".txt" or ".log")
        files = [f for f in os.listdir(path) if f.endswith('.txt') or f.endswith('.log') or f.endswith('-release') or f.endswith('.out') or f.endswith('messages') or f.endswith('.zip')]
        files.sort()  # file is sorted list
        files = [os.path.join(path, name) for name in files]  # Joins the path and the name, so the files can be opened and scanned by the open() function
        # The following for loop searches all files with the selected format
        for filename in files:
            #print('start parsing... ' + str(datetime.datetime.now()))
            matched_line = []
            try:
                with open(filename, 'r', encoding='utf-8') as f:
                    f = f.readlines()
            except:
                with open(filename, 'r') as f:
                    f = f.readlines()
            #print('Finished parsing... ' + str(datetime.datetime.now()))
            for line in f:
                # strip out \x00 from read content, in case it's encoded differently
                line = line.replace('\x00', '')
                RE2 = r'^Version: \d.+\d.+\d.\w\d.+'
                RE3 = r'^.+version.(\d+.\d+.\d+.\d+)'
                pattern2 = re.compile('(' + RE2 + '|' + RE3 + ')', re.IGNORECASE)
                for match2 in pattern2.finditer(line):
                    matched_line.append(line)
                    print(line)

#Calling the function to use it
myFunction(r"SampleZippedFolder.zip")
The try and except block of my code was my attempt to open the zipped folder and read it. I'm still not very clear with how to open the zipped folder or how it works. Please let me know how I can modify my code to make it work, much appreciated!
One possibility is to first determine what type of object folder_name is, using zipfile.is_zipfile() and os.path.isdir(), and whichever one succeeds, get the list of files and proceed. Maybe something like this:
import zipfile, os, re

def myFunction(folder_name):
    files = None  # nothing yet
    path = folder_name
    if zipfile.is_zipfile(path):
        print('ZipFile: {}'.format(path))
        f = zipfile.ZipFile(path)
        files = f.namelist()
        # for name in f.namelist():  # debugging
        #     print('file: {}'.format(name))
    elif os.path.isdir(path):
        print('Folder: {}'.format(path))
        files = os.listdir(path)
        # for name in os.listdir(path):  # debugging
        #     print('file: {}'.format(name))
    # should now have a list of files
    # proceed processing the files
    for filename in files:
        ...
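A rough sketch of what that processing loop might then look like (this part is my assumption, not spelled out in the answer; it reuses the ZipFile object f and path from above and assumes pattern2 is compiled as in the question):
    for filename in files:
        if zipfile.is_zipfile(path):
            # members of a zip archive are opened through the archive object and yield bytes
            with f.open(filename) as member:
                lines = [raw.decode('utf-8', errors='ignore') for raw in member.readlines()]
        else:
            with open(os.path.join(path, filename), 'r', errors='ignore') as plain:
                lines = plain.readlines()
        for line in lines:
            line = line.replace('\x00', '')
            for match2 in pattern2.finditer(line):
                print(line)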
Right, I'm relatively new to Python, which you will likely see in my code, but is there any way to iterate through a list within regex?
Basically, I'm looping through each filename within a folder, getting a code (2-6 digits) from the filename, and I want to compare it with a list of codes in a text file, which have a name attached, in the format "1234_Name" (without the quotation marks). If the code exists in both lists, I want to print out the list entry, i.e. 1234_Name. Currently my code only seems to look at the first entry in the text file's list and I'm not sure how to make it look through them all to find matches.
import os, re

sitesfile = open('C:/Users/me/My Documents/WORK_PYTHON/Renaming/testnames.txt', 'r')
filefolder = r'C:/Users/me/My Documents/WORK_PYTHON/Renaming/files/'

sites = sitesfile.read()
site_split = re.split('\n', sites)

old = []
newname = []
for site in site_split:
    newname.append(site)

for root, dirs, filenames in os.walk(filefolder):
    for filename in filenames:
        fullpath = os.path.join(root, filename)
        filename_split = os.path.splitext(fullpath)
        filename_zero, fileext = filename_split
        filename_zs = re.split("/", filename_zero)
        filenm = re.search(r"[\w]+", str(filename_zs[-1:]))  # get only filename, not path
        filenmgrp = filenm.group()
        pacode = re.search('\d\d+', filenmgrp)
        if pacode:
            pacodegrp = pacode.group()
            match = re.match(pacodegrp, site)
            if match:
                print site
Hope this makes sense - thanks a lot in advance!
So, use this code instead:
import os
import re
def locate(pattern=r'\d+[_]', root=os.curdir):
    for path, dirs, files in os.walk(os.path.abspath(root)):
        for filename in re.findall(pattern, ' '.join(files)):
            yield os.path.join(path, filename)
..this will only return files in a folder that match a given regex pattern.
with open('list_file.txt', 'r') as f:
    lines = [x.split('_')[0] for x in f.readlines()]

print_out = []
for f in locate(<your code regex>, <your directory>):
    # compare only the code part of the matched name, not the full path
    code = os.path.basename(f).rstrip('_')
    if code in lines:
        print_out.append(f)
print(print_out)
...find the valid codes in your list_file first, then compare the files that come back with your given regex.