My data is organized as such:
I have 30 folders. In each of them, 3 subfolders. In each of them, one file.
I would like to write a script that writes, in a text file 1 located in folder 1, the paths to the files located in the subfolders of this folder 1; and so on for every other folder.
The problem is that the script only writes, in each text file, the path of the 3rd file (the file in subfolder 3), rather than the paths of the files in subfolders 1, 2, and 3.
This is what I tried:
import glob
import os

gotofolders = '/path/to/folderslocation/'
foldersname = open('/path/to/foldersname.txt').read().split()

for folders in foldersname:
    foldersdirectory = os.path.join(gotofolders, folders)
    filepaths = glob.glob(os.path.join(foldersdirectory, '*subfolders/*files'))
    for filepath in filepaths:
        savethepaths = os.path.join(foldersdirectory, 'files_path_in_that_folder.txt')
        with open(savethepaths, 'w') as f:
            f.write(filepath + '\n')
As said, it almost works, except that each 'files_path_in_that_folder.txt' contains only the 3rd element of the filepaths list, rather than all 3 elements.
Thanks!
Okay, I figured it out; I had to replace the inner write loop with:

with open(savethepaths, 'w') as f:
    f.writelines("%s\n" % filepath for filepath in filepaths)
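For completeness, a minimal sketch of the corrected loop (same placeholder paths as above), opening each text file once per folder instead of once per matched file:

import glob
import os

gotofolders = '/path/to/folderslocation/'
foldersname = open('/path/to/foldersname.txt').read().split()

for folders in foldersname:
    foldersdirectory = os.path.join(gotofolders, folders)
    filepaths = glob.glob(os.path.join(foldersdirectory, '*subfolders/*files'))
    savethepaths = os.path.join(foldersdirectory, 'files_path_in_that_folder.txt')
    # open once per folder; the original reopened the file with 'w' for every
    # path, truncating it each time, which is why only the last path survived
    with open(savethepaths, 'w') as f:
        f.writelines("%s\n" % filepath for filepath in filepaths)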
import os

def directory_into_file(_path, file_obj, depth):
    # depth is a string of asterisks, just for nicer printing; starts as an empty string
    file_obj.write(depth + _path + '\n')
    if os.path.isdir(_path):
        file_list = os.listdir(_path)
        os.chdir(_path)
        for file in file_list:
            directory_into_file(file, file_obj, depth + '*')
        os.chdir("..")
This should work.
_path - the path of the directory,
file_obj - an already-open file object to write into,
depth - pass an empty string on the first call.
Hope this works. I didn't try it myself...
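A hypothetical call of the function above (the folder name and output file are placeholders, assuming a folder named folder1 in the current directory):

# dump the tree rooted at 'folder1' into listing.txt
with open('listing.txt', 'w') as out:
    directory_into_file('folder1', out, '')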
I have a folder with text files in it. I want to be able to put in a path to this folder and have Python go through it, open each file, and append its contents to a list.
import os

folderpath = "/Users/myname/Downloads/files/"
inputlst = [os.listdir(folderpath)]
filenamelist = []
for filename in os.listdir(folderpath):
    if filename.endswith(".txt"):
        filenamelist.append(filename)
print(filenamelist)
So far this outputs:
['test1.txt', 'test2.txt', 'test3.txt', 'test4.txt', 'test5.txt', 'test6.txt', 'test7.txt', 'test8.txt', 'test9.txt', 'test10.txt']
I want the code to take each of these files, open it, and put all of its content into a single huge list, not just print the file names. Is there any way to do this?
You should use open for this; see the documentation (https://docs.python.org/3/library/functions.html#open) for its more advanced options.
Anyway, here is one way you can do it:
import os

folderpath = r"yourfolderpath"
filenamecontent = []
for filename in os.listdir(folderpath):
    if filename.endswith(".txt"):
        # read each .txt file and keep its full contents
        with open(os.path.join(folderpath, filename), 'r') as f:
            filenamecontent.append(f.read())
print(filenamecontent)
If you are using Python 3, you can use:

for filename in filename_list:
    with open(filename, "r") as file_handler:
        data = file_handler.read()
        # process data here; it is replaced on the next iteration

Please do mind that you will need the full (either relative or absolute) path to your file in filename.
This way, your file handler will be automatically closed when you get out of the with scope.
More information here: https://docs.python.org/fr/3/library/functions.html#open
On a side note, in order to list the files, you might want to have a look at glob and use:

import glob
filename_list = glob.glob("/path/to/files/*.txt")
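Putting the two together, a small sketch (the directory path is the placeholder from above) that collects every .txt file's contents into a list:

import glob

contents = []
for filename in glob.glob("/path/to/files/*.txt"):
    with open(filename, "r") as file_handler:
        # each file is closed automatically before the next one is opened
        contents.append(file_handler.read())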
You can use fileinput
Code:
import fileinput
import os

folderpath = "your_path_to_directory_where_files_are_stored"
# Collect all the files which are in .txt format
file_list = [os.path.join(folderpath, a) for a in os.listdir(folderpath) if a.endswith(".txt")]
get_all_files = fileinput.input(file_list)
with open("alldata.txt", 'a') as writefile:
    for line in get_all_files:
        # lines from fileinput keep their trailing newline
        writefile.write(line)
The above code reads all the data from the .txt files in the specified directory (folderpath) and stores it in alldata.txt. So the long list you wanted is now stored in a .txt file; if you don't need the file itself, you can remove the write step.
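If you also want to know which file each line came from, fileinput keeps track of that; a small sketch reusing file_list from above:

import fileinput

with fileinput.input(file_list) as get_all_files:
    for line in get_all_files:
        # fileinput.filename() names the file the current line belongs to
        print(fileinput.filename(), line, end='')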
Links:
https://docs.python.org/3/library/fileinput.html
https://docs.python.org/3/library/functions.html#open
I'd like to read the contents of every file in a folder/directory and then print them at the end (I eventually want to pick out bits and pieces from the individual files and put them in a separate document)
So far I have this code
import os
path = 'results/'
fileList = os.listdir(path)
for i in fileList:
    file = open(os.path.join('results/' + i), 'r')
allLines = file.readlines()
print(allLines)
At the end I don't get any errors, but it only prints the contents of the last file in my folder as a series of strings, and I want to make sure it's reading every file so I can then access the data I want from each file. I've looked online and I can't find where I'm going wrong. Is there any way of making sure the loop is iterating over all my files and reading all of them?
I also get the same result when I use

file = open(os.path.join('results/', i), 'r')

in the 5th line.

Please help, I'm so lost.
Thanks!!
Separate the different functions of the thing you want to do.
Use generators wherever possible, especially if there are a lot of files or large files.
Imports
from pathlib import Path
import sys
Deciding which files to process:
source_dir = Path('results/')
files = source_dir.iterdir()
[Optional] Filter files
For example, if you only need files with extension .ext
files = source_dir.glob('*.ext')
Process files
def process_files(files):
    for file in files:
        with file.open('r') as file_handle:
            for line in file_handle:
                # do your thing
                yield line
Save the lines you want to keep
def save_lines(lines, output_file=sys.stdout):
    for line in lines:
        output_file.write(line)
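Wiring the pieces above together, the whole pipeline stays lazy: nothing is read until the lines are written out.

# stream every line of every selected file to stdout
save_lines(process_files(files))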
You forgot the indentation at this line: allLines = file.readlines().
And maybe you can try this:
import os

allLines = []
path = 'results/'
fileList = os.listdir(path)
for file in fileList:
    file = open(os.path.join(path, file), 'r')
    allLines.append(file.read())
print(allLines)
You forgot to indent this line: allLines.append(file.read()).
Because it was outside the loop, it only appended the file variable to the list after the for loop had finished, so it only captured the last value that file held. Also, you should not use readlines() in this way; just use read() instead:
import os

allLines = []
path = 'results/'
fileList = os.listdir(path)
for file in fileList:
    file = open(os.path.join(path, file), 'r')
    allLines.append(file.read())
print(allLines)
This also creates a file containing the contents of all the files you wanted to print.

import os

rootdir = 'C:\\Users\\you\\folder\\'  # your folder
f = open('final_file.txt', 'a')
for root, dirs, files in os.walk(rootdir):
    for filename in files:
        full_name = os.path.join(root, filename)
        data = open(full_name).read()
        f.write(data + "\n")
f.close()
This is a similar case, with more features: Copying selected lines from files in different directories to another file
I am having a hard time looping through files in a directory that is different from the directory where the script lives. Ideally I also want my script to go through all files that start with sasa. There are a couple of files in the folder, such as sasa.1, sasa.2, etc., as well as other files such as doc1.pdf and doc2.pdf.
I use Python 2.7 with Windows PowerShell.
Locations of Everything
1) Python Script Location ex: C:Users\user\python_project
2) Main_Directory ex: C:Users\user\Desktop\Data
3) Current_Working_Directory ex: C:Users\user\python_project
Main directory contains 100 folders (folder A, B, C, D etc..)
Each of these folders contains many files including the sasa files of interest.
Attempts at running script
For 1 file the following works:
Script is run the following way: python script1.py
file_path = 'C:Users\user\Desktop\Data\A\sasa.1'

def writing_function(file_path):
    with open(file_path) as file_object:
        lines = file_object.readlines()
        for line in lines:
            print(line)

writing_function(file_path)
However, the following does not work
Script is run the following way: python script1.py A sasa.1
import os
import sys
from os.path import join

dr = sys.argv[1]
file_name = sys.argv[2]
file_path = 'C:Users\user\Desktop\Data'
new_file_path = os.path.join(file_path, dr)
new_file_path2 = os.path.join(new_file_path, file_name)

def writing_function(paths):
    with open(paths) as file_object:
        lines = file_object.readlines()
        for line in lines:
            print(line)

writing_function(new_file_path2)
I get the following error:
with open(paths) as file_object:
IOError: [Errno 2] No such file or directory: 'C:Users\\user\\Desktop\\A\\sasa.1'
Please note that right now I am just working on one file; I want to be able to loop through all of the sasa files in the folder.
It can be something along the lines of:

import os
from os.path import join

def function_exec(file):
    # code to execute on each file
    pass

for root, dirs, files in os.walk('path/to/your/files'):  # from your argv[1]
    for f in files:
        filename = join(root, f)
        function_exec(filename)
Avoid using the variable name dir; it shadows the built-in dir() function. Try print(dir(os)).

dir_ = sys.argv[1]  # is preferable
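To restrict the walk to the sasa files the question asks about, one possibility is filtering on the name (a sketch; fnmatch ships with the standard library, the root path is the question's Data directory, and writing_function is the question's own helper):

import os
from fnmatch import fnmatch
from os.path import join

for root, dirs, files in os.walk(r'C:\Users\user\Desktop\Data'):
    for f in files:
        if fnmatch(f, 'sasa.*'):
            # run the question's writing_function on each matching file
            writing_function(join(root, f))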
No one mentioned glob so far, so:
https://docs.python.org/3/library/glob.html
I think you can solve your problem using its ** magic:
If recursive is true, the pattern “**” will match any files and zero
or more directories and subdirectories. If the pattern is followed by
an os.sep, only directories and subdirectories match.
Also note you can change the current working directory using:

os.chdir(path)
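For example, a minimal sketch of the recursive pattern (recursive=True requires Python 3.5+, so this assumes a newer interpreter than the question's 2.7; the path is the question's Data folder):

import glob

# match every sasa.* file anywhere below Data
for path in glob.glob(r'C:\Users\user\Desktop\Data\**\sasa.*', recursive=True):
    print(path)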
I would like to read all the files in a directory, so I'm doing the following:
import glob

path = '/thepath/of/the/files/*'
files = glob.glob(path)
for file in files:
    print file
The problem is that when I print the files I don't obtain anything; any idea how to return all the content of the files in a list, one entry per file?
EDIT: I appended an asterisk to the path; this should give you all the files and directories in that path.
Like in the comment I posted some time ago, this should work:

contents = [open(ii).read() for ii in glob.glob(path)]

or this, if you want a dictionary instead:

contents = {ii: open(ii).read() for ii in glob.glob(path)}
I would do something like the following to only get files.
import os
import glob

path = '/thepath/of/the/files/*'
files = glob.glob(path)
for file in files:
    if os.path.isfile(file):
        print file
Your question is kind of unclear, but as I understand it, you'd like to get the contents of all the files in the directory. Try this:
# ...
contents = {}
for file in files:
    with open(file) as f:
        contents[file] = f.readlines()
print contents
This creates a dict where the key is the file name, and the value is the contents of the file.
I am looking for a way to unzip nested zip files in python. For example, consider the following structure (hypothetical names for ease):
Folder
    ZipfileA.zip
        ZipfileA1.zip
        ZipfileA2.zip
    ZipfileB.zip
        ZipfileB1.zip
        ZipfileB2.zip
...etc. I am trying to access text files that are within the second zip. I certainly don't want to extract everything, as the sheer numbers would crash the computer (there are several hundred zips in the first layer, and almost 10,000 in the second layer, per zip).
I have been playing around with the zipfile module - I am able to open the 1st level of zipfiles. E.g.:
zipfile_obj = zipfile.ZipFile("/Folder/ZipfileA.zip")
next_layer_zip = zipfile_obj.open("ZipfileA1.zip")
However, this returns a "ZipExtFile" instance (not a file or ZipFile instance), so I can't then go on and open this particular data type. That is, I can't do this:

data = next_layer_zip.open("data.txt")
I can, however, "read" this zip file with:

next_layer_zip.read()

But this is entirely useless! (i.e. it can only read compressed data/gobbledygook).
Does anyone have any ideas on how I might go about this (without using ZipFile.extract)??
I came across this, http://pypi.python.org/pypi/zip_open/, which looks to do exactly what I want, but it doesn't seem to work for me (I keep getting "[Errno 2] No such file or directory" for the files I am trying to process with that module).
Any ideas would be much appreciated!! Thanks in advance
ZipFile needs a file-like object, so you can use StringIO to turn the data you read from the nested zip into such an object. The caveat is that you'll be loading the full (still compressed) inner zip into memory.
import zipfile
import cStringIO

with zipfile.ZipFile('foo.zip') as z:
    with z.open('nested.zip') as z2:
        z2_filedata = cStringIO.StringIO(z2.read())
        with zipfile.ZipFile(z2_filedata) as nested_zip:
            print nested_zip.open('data.txt').read()
Unfortunately, decompressing zip files requires random access to the archive, and the ZipFile methods (not to mention the DEFLATE algorithm itself) only provide streams. It is therefore impossible to decompress nested zip files without extracting them, at least into memory, which is what the function below does.
Here's a function I came up with.
import os
import zipfile
from StringIO import StringIO

def extract_nested_zipfile(path, parent_zip=None):
    """Returns a ZipFile specified by path, even if the path contains
    intermediary ZipFiles. For example, /root/gparent.zip/parent.zip/child.zip
    will return a ZipFile that represents child.zip
    """
    def extract_inner_zipfile(parent_zip, child_zip_path):
        """Returns a ZipFile specified by child_zip_path that exists inside
        parent_zip.
        """
        memory_zip = StringIO()
        memory_zip.write(parent_zip.open(child_zip_path).read())
        return zipfile.ZipFile(memory_zip)

    if ('.zip' + os.sep) in path:
        (parent_zip_path, child_zip_path) = os.path.relpath(path).split(
            '.zip' + os.sep, 1)
        parent_zip_path += '.zip'

        if not parent_zip:
            # This is the top-level, so read from disk
            parent_zip = zipfile.ZipFile(parent_zip_path)
        else:
            # We're already in a zip, so pull it out and recurse
            parent_zip = extract_inner_zipfile(parent_zip, parent_zip_path)

        return extract_nested_zipfile(child_zip_path, parent_zip)
    else:
        if parent_zip:
            return extract_inner_zipfile(parent_zip, path)
        else:
            # If there is no nesting, it's easy!
            return zipfile.ZipFile(path)
Here's how I tested it:
echo hello world > hi.txt
zip wrap1.zip hi.txt
zip wrap2.zip wrap1.zip
zip wrap3.zip wrap2.zip
print extract_nested_zipfile('/Users/mattfaus/dev/dev-git/wrap1.zip').open('hi.txt').read()
print extract_nested_zipfile('/Users/mattfaus/dev/dev-git/wrap2.zip/wrap1.zip').open('hi.txt').read()
print extract_nested_zipfile('/Users/mattfaus/dev/dev-git/wrap3.zip/wrap2.zip/wrap1.zip').open('hi.txt').read()
For those looking for a function that extracts a nested zip file (any level of nesting) and cleans up the original zip files:
import zipfile, re, os

def extract_nested_zip(zippedFile, toFolder):
    """ Unzip a zip file and its contents, including nested zip files.
        Delete the zip file(s) after extraction.
    """
    with zipfile.ZipFile(zippedFile, 'r') as zfile:
        zfile.extractall(path=toFolder)
    os.remove(zippedFile)
    for root, dirs, files in os.walk(toFolder):
        for filename in files:
            if re.search(r'\.zip$', filename):
                fileSpec = os.path.join(root, filename)
                extract_nested_zip(fileSpec, root)
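A hypothetical call, reusing the folder names from the question at the top of this thread; note that the function deletes each zip after extracting it:

# extract ZipfileA.zip and every zip inside it into the 'extracted' folder,
# removing the zip files as it goes
extract_nested_zip('Folder/ZipfileA.zip', 'extracted')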
I use Python 3.7.3
import zipfile
import io

with zipfile.ZipFile('all.zip') as z:
    with z.open('nested.zip') as z2:
        z2_filedata = io.BytesIO(z2.read())
        with zipfile.ZipFile(z2_filedata) as nested_zip:
            print(nested_zip.open('readme.md').read())
This works for me. Just place this script in the same directory as the nested zip. It will also count the total number of files within the nested zips.
import os
from zipfile import ZipFile

def unzip(path, total_count):
    for root, dirs, files in os.walk(path):
        for file in files:
            file_name = os.path.join(root, file)
            if not file_name.endswith('.zip'):
                total_count += 1
            else:
                currentdir = file_name[:-4]
                if not os.path.exists(currentdir):
                    os.makedirs(currentdir)
                with ZipFile(file_name) as zipObj:
                    zipObj.extractall(currentdir)
                os.remove(file_name)
                total_count = unzip(currentdir, total_count)
    return total_count

total_count = unzip('.', 0)
print(total_count)
My approach to such a problem is this; it self-assigns objects, i.e. each spreadsheet inside the nested archive becomes a variable named after its file:
import os
import re
import zipfile
import pandas as pd
# import numpy as np

path = r'G:\Important\Data\EKATTE'

# DESCRIBE
archives = os.listdir(path)
archives = [ar for ar in archives if ar.endswith(".zip")]

contents = pd.DataFrame({'elec_date': [], 'files': []})
for a in archives:
    archive = zipfile.ZipFile(path + '\\' + a)
    filelist = archive.namelist()
    # archive.infolist()
    for i in archive.namelist():
        if re.match('.*zip', i):
            sub_arch = zipfile.ZipFile(archive.open(i))
            sub_names = [x for x in sub_arch.namelist()]
            for s in sub_names:
                exec(f"{s.split('.')[0]} = pd.read_excel(sub_arch.open(s), squeeze=True)")
The archive can be found on Bulgaria's National Statistics Institute page (direct link):
https://www.nsi.bg/sites/default/files/files/EKATTE/Ekatte.zip
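As a design note, instead of exec-ing a variable per spreadsheet, a dictionary keyed by file name holds the same data without dynamic variable names; a sketch under the same assumptions (names as in the loop above):

sheets = {}
for s in sub_names:
    # e.g. sheets['Ekatte'] holds the table read from Ekatte.xlsx inside the sub-archive
    sheets[s.split('.')[0]] = pd.read_excel(sub_arch.open(s))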