How to concatenate a large number of text files in Python

I have 367 text files I want to concatenate into one long text file. I was thinking I could go about this by doing:

filenames = ["file1.txt", "file2.txt", "file3.txt"...]
with open("output_file.txt", "w") as outfile:
    for filename in filenames:
        with open(filename) as infile:
            contents = infile.read()
            outfile.write(contents)
However, this would require me to write out every single text file name, which would be very tedious. Is there an easier way to concatenate a large number of text files using Python? Thank you!

Use glob.glob to build the list of files and shutil.copyfileobj for a more efficient copy.

import shutil
import glob

filenames = glob.glob("*.txt")  # or "file*.txt"
with open("output_file.txt", "wb") as outfile:
    for filename in filenames:
        with open(filename, "rb") as infile:
            shutil.copyfileobj(infile, outfile)

Assuming these files all have a counter (1-367) in their names, in the format file<count>.txt, we can generate the 367 filenames with a list comprehension.

filenames = [f'file{i}.txt' for i in range(1, 368)]
with open("output_file.txt", "w") as outfile:
    for filename in filenames:
        with open(filename) as infile:
            contents = infile.read()
            outfile.write(contents)
Another option you could use is os.listdir.

import os

for i in os.listdir('files'):
    print(i)

output:

text1.txt
text2.txt
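The listing above only prints the names; here is a minimal sketch of how the same os.listdir call could drive the concatenation (assuming the input files sit in a files/ subdirectory, as in the snippet above):

import os

folder = 'files'  # directory holding the input .txt files
with open('output_file.txt', 'w') as outfile:
    for name in sorted(os.listdir(folder)):
        if name.endswith('.txt'):
            with open(os.path.join(folder, name)) as infile:
                outfile.write(infile.read())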

Related

Merge multiple files from server folder into one

I have tried to merge some files in a server folder into a new file, saving it under the same server folder.
In my script below, I keep receiving an unexpected indent error. I would like to seek some expert guidance.

import pandas as pd # import need to be in lower case
import numpy as np
import openpyxl
from openpyxl.workbook import workbook #save to excel doc

#>>> 1.1 Define common file path and filename
path= '\hbap.adroot.abb\HK\Finance\00210602\AMH_A2R\1KY\Drv Reengine\Python\'

#>>> 1.2 Define list of files
filenames = [path+'100_6.xlsx', path+'101_6.xlsx']

# Open file3 in write mode
with Open(r path+'file3.xlsx','w') as outfile:
    # Iterate through list
    for names in filenames:
        # Open each file in read mode
        with open(names) as infile:
            # read the data from file1 and
            # file2 and write it in file3
            outfile.write(infile.read())
        # Add '\n' to enter data of file2
        # from next line
        outfile.write("\n")
Please take a look at the proposed solution.
In this case I take all files present in the /sql directory, read them one by one, and append the result to the output file.

import os

files_list = list()
output = r"E:\Downloads\output.txt"

for (dirpath, dirnames, filenames) in os.walk(r'E:\Downloads\sql'):
    files_list += [os.path.join(dirpath, file) for file in filenames]

for file in files_list:
    fin = open(file, "rt")
    data = fin.read()
    fin.close()

    fin = open(output, "a+")
    fin.write(data)
    fin.write("\n ---------- \n")
    fin.close()

Also, I might point out that you are dealing with .xlsx files, which is a slightly different topic; merging Excel files should be treated in another way.
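For example, a rough sketch of merging .xlsx files with pandas instead (the file names are placeholders, and each workbook's first sheet is assumed to share the same columns):

import pandas as pd

filenames = ['100_6.xlsx', '101_6.xlsx']               # placeholder input workbooks

frames = [pd.read_excel(name) for name in filenames]   # read each first sheet into a DataFrame
merged = pd.concat(frames, ignore_index=True)          # stack the rows
merged.to_excel('merged.xlsx', index=False)            # write the combined workbook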
Having correct indentation is important in Python, as the interpreter uses it to parse the code.
This has a consistent indent level:

# Open file3 in write mode
# (use the built-in open, lowercase, and plain string concatenation for the path)
with open(path + 'file3.xlsx', 'w') as outfile:
    # Iterate through list
    for names in filenames:
        # Open each file in read mode
        with open(names) as infile:
            # read the data from file1 and
            # file2 and write it in file3
            outfile.write(infile.read())
        # Add '\n' to enter data of file2
        # from next line
        outfile.write("\n")

How to open a file based on its extension?

I want to open any .txt file in the same directory.
In Ruby I can do:

File.open("*.txt").each do |line|
  puts line
end

In Python I can't do this; it will give an error:

file = open("*.txt", "r")
print(file.read())
file.close()

It gives an "invalid argument" error.
So is there any way around it?
You can directly use the glob module for this:

import glob

for file in glob.glob('*.txt'):
    with open(file, 'r') as f:
        print(f.read())
Use os.listdir to list all files in the current directory.

import os

all_files = os.listdir()

Then, filter the ones which have the extension you are looking for and open each one of them in a loop.

for filename in all_files:
    if filename.lower().endswith('.txt'):
        with open(filename, 'rt') as f:
            f.read()

How to open a bunch of .txt files as one string via Python

It's clear to me how to open one file; it's pretty straightforward using the open() function, just like this:

with open('number.txt', 'rb') as myfile:
    data = myfile.read()

But what should I do if I want to open 5 .txt files and also view them as a single string in Python? Should I somehow use the possibilities of os.listdir()?
Here is a flexible/reusable approach for doing exactly what you need:

def read_files(files):
    for filename in files:
        with open(filename) as file:   # text mode, so the pieces can be joined with a str separator
            yield file.read()

def read_files_as_string(files, separator='\n'):
    files_content = list(read_files(files=files))
    return separator.join(files_content)

# build your files list as you need
files = ['f1.txt', 'f2.txt', 'f3.txt']

files_content_str = read_files_as_string(files)
print(files_content_str)
Looks like you need something like this:

import os

path = "your_path"
for filename in os.listdir(path):
    if filename.endswith(".txt"):
        with open(os.path.join(path, filename), 'rb') as myfile:
            data = myfile.read()

Concatenate .txt files. Write content in one .txt file

I have some .txt files in a folder. I need to collect their content all in one .txt file. I'm working with Python and tried:

import os

rootdir = "\\path_to_folder\\"

for files in os.walk(rootdir):
    with open("out.txt", 'w') as outfile:
        for fname in files:
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)

but it did not work. The 'out.txt' file is generated but the code never ends. Any advice? Thanks in advance.
os.walk returns tuples, not filenames:

with open("out.txt", 'w') as outfile:
    for root, dirs, files in os.walk(rootdir):
        for fname in files:
            with open(os.path.join(root, fname)) as infile:
                for line in infile:
                    outfile.write(line)

Also, you should open outfile at the beginning, not in each loop iteration.
This solved my problem. The generated 'out.txt' file is just 151KB.

file_list = os.listdir("\\path_to_folder\\")

with open('out.txt', 'a+') as outfile:
    for fname in file_list:
        with open(fname) as infile:
            outfile.write(infile.read())

Thanks everybody.

How do I concatenate text files in Python?

I have a list of 20 file names, like ['file1.txt', 'file2.txt', ...]. I want to write a Python script to concatenate these files into a new file. I could open each file by f = open(...), read line by line by calling f.readline(), and write each line into that new file. It doesn't seem very "elegant" to me, especially the part where I have to read/write line by line.
Is there a more "elegant" way to do this in Python?
This should do it.

For large files:

filenames = ['file1.txt', 'file2.txt', ...]
with open('path/to/output/file', 'w') as outfile:
    for fname in filenames:
        with open(fname) as infile:
            for line in infile:
                outfile.write(line)

For small files:

filenames = ['file1.txt', 'file2.txt', ...]
with open('path/to/output/file', 'w') as outfile:
    for fname in filenames:
        with open(fname) as infile:
            outfile.write(infile.read())

… and another interesting one that I thought of:

import itertools

filenames = ['file1.txt', 'file2.txt', ...]
with open('path/to/output/file', 'w') as outfile:
    # itertools.imap is Python 2 only; on Python 3, use the built-in map instead
    for line in itertools.chain.from_iterable(itertools.imap(open, filenames)):
        outfile.write(line)

Sadly, this last method leaves a few open file descriptors, which the GC should take care of anyway. I just thought it was interesting.
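If you want the single-iterator convenience without the leaked descriptors, one possible sketch (not part of the original answer) is a small generator that closes each file as soon as it is exhausted:

def lines_from(filenames):
    # Yield every line from every file, closing each file when it is exhausted.
    for name in filenames:
        with open(name) as f:
            yield from f

filenames = ['file1.txt', 'file2.txt', 'file3.txt']
with open('path/to/output/file', 'w') as outfile:
    for line in lines_from(filenames):
        outfile.write(line)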
Use shutil.copyfileobj.

It automatically reads the input files chunk by chunk for you, which is more efficient than reading the files into memory yourself, and it will work even if some of the input files are too large to fit into memory:

import shutil

with open('output_file.txt', 'wb') as wfd:
    for f in ['seg1.txt', 'seg2.txt', 'seg3.txt']:
        with open(f, 'rb') as fd:
            shutil.copyfileobj(fd, wfd)
That's exactly what fileinput is for:

import fileinput

with open(outfilename, 'w') as fout, fileinput.input(filenames) as fin:
    for line in fin:
        fout.write(line)
For this use case, it's really not much simpler than just iterating over the files manually, but in other cases, having a single iterator that iterates over all of the files as if they were a single file is very handy. (Also, the fact that fileinput closes each file as soon as it's done means there's no need to with or close each one, but that's just a one-line savings, not that big of a deal.)
There are some other nifty features in fileinput, like the ability to do in-place modifications of files just by filtering each line.
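As an illustration of that last point, a small sketch (the file name is a placeholder) of filtering a file in place with fileinput:

import fileinput

# Rewrite 'notes.txt' in place, dropping blank lines; whatever the loop
# prints replaces the original contents of the file.
with fileinput.input('notes.txt', inplace=True) as fin:
    for line in fin:
        if line.strip():
            print(line, end='')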
As noted in the comments, and discussed in another post, fileinput for Python 2.7 will not work as indicated. Here is a slight modification to make the code Python 2.7 compliant:

import fileinput

with open(outfilename, 'w') as fout:
    fin = fileinput.input(filenames)
    for line in fin:
        fout.write(line)
    fin.close()
A simple benchmark shows that shutil performs better:

outfile.write(infile.read())               # time: 2.1085190773010254s
shutil.copyfileobj(fd, wfd, 1024*1024*10)  # time: 0.60599684715271s
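A sketch of how such a comparison could be timed (the input names and chunk size are assumptions; absolute numbers will vary by machine):

import shutil
import time

filenames = ['seg1.txt', 'seg2.txt', 'seg3.txt']   # placeholder inputs

start = time.perf_counter()
with open('out_read.txt', 'wb') as wfd:
    for name in filenames:
        with open(name, 'rb') as fd:
            wfd.write(fd.read())                   # whole-file read/write
print('read/write: ', time.perf_counter() - start)

start = time.perf_counter()
with open('out_copyfileobj.txt', 'wb') as wfd:
    for name in filenames:
        with open(name, 'rb') as fd:
            shutil.copyfileobj(fd, wfd, 1024*1024*10)   # 10 MiB buffer
print('copyfileobj:', time.perf_counter() - start)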
I don't know about elegance, but this works:

import glob
import os

for f in glob.glob("file*.txt"):
    os.system("cat " + f + " >> OutFile.txt")
If you have a lot of files in the directory, then glob2 might be a better option to generate a list of filenames rather than writing them by hand.

import glob2

filenames = glob2.glob('*.txt')  # list of all .txt files in the directory
with open('outfile.txt', 'w') as f:
    for file in filenames:
        with open(file) as infile:
            f.write(infile.read() + '\n')
What's wrong with UNIX commands? (Given you're not working on Windows.)

ls | xargs cat | tee output.txt does the job (you can call it from Python with subprocess if you want).
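A possible sketch of calling that pipeline from Python (shell=True so the pipes are interpreted by the shell):

import subprocess

# Run the same shell pipeline from Python.
subprocess.run('ls | xargs cat | tee output.txt', shell=True, check=True)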
An alternative to #inspectorG4dget's answer (best answer to date, 29-03-2016). I tested with 3 files of 436MB.

#inspectorG4dget's solution: 162 seconds
The following solution: 125 seconds

from subprocess import Popen

filenames = ['file1.txt', 'file2.txt', 'file3.txt']

fbatch = open('batch.bat', 'w')
str = "type "
for f in filenames:
    str += f + " "
fbatch.write(str + " > file4results.txt")
fbatch.close()

p = Popen("batch.bat", cwd=r"Drive:\Path\to\folder")
stdout, stderr = p.communicate()

The idea is to create a batch file and execute it, taking advantage of "good old technology". It's semi-Python but works faster. Works on Windows.
Check out the .read() method of the File object:
http://docs.python.org/2/tutorial/inputoutput.html#methods-of-file-objects
You could do something like:

concat = ""
for file in files:
    concat += open(file).read()

or a more 'elegant', Pythonic way:

concat = ''.join([open(f).read() for f in files])
which, according to this article: http://www.skymind.com/~ocrow/python_string/ would also be the fastest.
If the files are not gigantic:

with open('newfile.txt', 'wb') as newf:
    for filename in list_of_files:
        with open(filename, 'rb') as hf:
            newf.write(hf.read())
            # newf.write(b'\n\n\n') if you want to introduce
            # some blank lines between the contents of the copied files

If the files are too big to be entirely read and held in RAM, the algorithm must be a little different: read each file to be copied in a loop, by chunks of fixed length, using read(10000) for example.
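A minimal sketch of that chunked variant (the chunk size and file list are placeholders):

CHUNK_SIZE = 10000   # bytes read per iteration

list_of_files = ['file1.txt', 'file2.txt']   # placeholder inputs
with open('newfile.txt', 'wb') as newf:
    for filename in list_of_files:
        with open(filename, 'rb') as hf:
            while True:
                chunk = hf.read(CHUNK_SIZE)
                if not chunk:
                    break
                newf.write(chunk)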
import os

def concatFiles():
    path = 'input/'
    files = os.listdir(path)
    for idx, infile in enumerate(files):
        print("File #" + str(idx) + " " + infile)
    concat = ''.join([open(path + f).read() for f in files])
    with open("output_concatFile.txt", "w") as fo:
        fo.write(concat)

if __name__ == "__main__":
    concatFiles()
import os

files = os.listdir()
print(files)
print('#', tuple(files))

name = input('Enter the inclusive file name: ')
exten = input('Enter the type(extension): ')
filename = name + '.' + exten

output_file = open(filename, 'w+')
for i in files:
    print(i)
    f_j = open(i, 'r')
    data = f_j.read()          # read the whole file once
    print(data)
    output_file.write(data)    # append its contents to the output file
    f_j.close()
output_file.close()
