Merge multiple files from server folder into one - python

I am trying to merge some files in a server folder into a new file, saved in the same server folder.
With my script below, I keep receiving an "unexpected indent" error. I would appreciate some expert guidance.
import pandas as pd # import need to be in lower case
import numpy as np
import openpyxl
from openpyxl.workbook import workbook #save to excel doc
#>>> 1.1 Define common file path and filename
path= '\hbap.adroot.abb\HK\Finance\00210602\AMH_A2R\1KY\Drv Reengine\Python\'
#>>> 1.2 Define list of files
filenames = [path+'100_6.xlsx', path+'101_6.xlsx']
# Open file3 in write mode
with Open(r path+'file3.xlsx','w') as outfile:
# Iterate through list
for names in filenames:
# Open each file in read mode
with open(names) as infile:
# read the data from file1 and
# file2 and write it in file3
outfile.write(infile.read())
# Add '\n' to enter data of file2
# from next line
outfile.write("\n")

Please take a look at the proposed solution below.
In this case I take all the files present in the /sql directory, read them one by one, and append their contents to the output file.
import os

files_list = list()
output = r"E:\Downloads\output.txt"

for (dirpath, dirnames, filenames) in os.walk(r'E:\Downloads\sql'):
    files_list += [os.path.join(dirpath, file) for file in filenames]

for file in files_list:
    fin = open(file, "rt")
    data = fin.read()
    fin.close()
    fin = open(output, "a+")
    fin.write(data)
    fin.write("\n ---------- \n")
    fin.close()
Also, note that you are dealing with .xlsx files, which is a slightly different topic: .xlsx workbooks are not plain text, so merging Excel files should be handled in another way.
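For completeness, here is a minimal sketch of merging the workbooks with pandas instead of raw file writes, assuming each file has a single sheet with the same column layout; the output name file3.xlsx and the UNC path are taken from the question and may need adjusting:

import pandas as pd

# Assumed UNC share from the question; note the raw string and the double leading backslash.
path = r'\\hbap.adroot.abb\HK\Finance\00210602\AMH_A2R\1KY\Drv Reengine\Python'
filenames = [path + r'\100_6.xlsx', path + r'\101_6.xlsx']

frames = [pd.read_excel(name) for name in filenames]  # one DataFrame per workbook
merged = pd.concat(frames, ignore_index=True)         # stack the rows
merged.to_excel(path + r'\file3.xlsx', index=False)   # write a single combined workbook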

Having correct indentation is important in Python, as the interpreter uses it to determine block structure.
Here is your code with a consistent indent level (also using the built-in open() rather than Open):
# Open file3 in write mode
with open(path + 'file3.xlsx', 'w') as outfile:
    # Iterate through the list
    for names in filenames:
        # Open each file in read mode
        with open(names) as infile:
            # Read the data from file1 and
            # file2 and write it to file3
            outfile.write(infile.read())
        # Add '\n' so the data of the next file
        # starts on a new line
        outfile.write("\n")

Related

How do I read all the HTML files in a directory and write the content into a CSV file using Python?

I am trying to read all the HTML files in a directory and write them into a CSV file. Each row in the CSV file will contain the contents of one HTML file.
I seem to be able to read one HTML file and write it into a CSV file.
import os, csv
import fnmatch
from pathlib import Path

directory = "directory/"
for dirpath, dirs, files in os.walk(directory):
    for filename in fnmatch.filter(files, '*.html'):
        with open(os.path.join(dirpath, filename)) as f:
            html = f.read()
            if 'apples and oranges' in html:
                with open('output.csv', 'w') as f:
                    writer = csv.writer(f)
                    lines = [[html]]
                    for l in lines:
                        writer.writerow(l)
I currently only see one HTML file being printed out into one CSV row.
It's because you used 'w' or write mode.
with open('output.csv', 'w') as f:
That truncates the file and overwrites the previous contents.
You should be using 'a' or append mode.
with open('output.csv', 'a') as f:
That opens the file for writing if it doesn't yet exist, or appends to the end of the file if it already exists.
When using open('output.csv', 'w'), the file is rewritten every time that line is executed, i.e. on every loop iteration. It's like saving new material to a file under the same name: whatever was in the file before won't be there afterwards. You need to use open('output.csv', 'a') so that the file you're writing to is only appended to instead of being overwritten. If you do use append, though, you want to remove the file first if it already exists; otherwise you'll be appending to your old results.
Here's a working example; I added some extra formatting to achieve the output you describe in the question.
import os
import fnmatch
import re

directory = "directory/"

# Remove the output file if it exists, otherwise you'll have output from the previous execution
if os.path.exists('output.csv'):
    os.remove('output.csv')

for dirpath, dirs, files in os.walk(directory):
    for filename in fnmatch.filter(files, '*.html'):
        with open(os.path.join(dirpath, filename)) as f:
            html = [line.rstrip("\n") for line in f]  # Put each line from the html file into a list
            lines = "".join(html)  # Concatenate that list into a single string
            line = re.sub(" +", " ", lines)  # Get rid of superfluous whitespace by replacing any chain of spaces with a single space
            if re.search("apples and oranges", line):
                with open('output.csv', 'a') as f:  # Changed 'w' (write) to 'a' (append); with 'w' the file is rewritten every time, with 'a' text is only appended to the end
                    f.write(line + ",\n")

# Remove the comma and newline at the end of the file
with open("output.csv", 'rb+') as filehandle:
    filehandle.seek(-2, os.SEEK_END)
    filehandle.truncate()

Append dbf files to one dbf file

I have many files in a directory, like ['FredrikstadAvst1.dbf', 'FredrikstadAvst2.dbf', ...]. I want to write a Python script to concatenate these files into a new "*.dbf" file.
I have a Python script that almost does the job, but it keeps overwriting the output file, so when the job is finished the output file only contains the contents of the last file in my directory.
Here is my script:
import os, glob, shutil

folder_path = r'C:\Tom\Oppdrag_2019\Pendle\2018'
for filename in glob.glob(os.path.join(folder_path, '*.dbf')):
    fd = open(filename, 'r')
    List = []
    List.append(fd)
    print filename
    wfd = open(r"C:\Tom\Oppdrag_2019\Pendle\FredrikstadAvst.dbf", 'a')
    shutil.copyfileobj(fd, wfd, 1024*1024*10)
Consider the following:
import os, glob, shutil

folder_path = r'C:\Tom\Oppdrag_2019\Pendle\2018'
wfd = open(r"C:\Tom\Oppdrag_2019\Pendle\FredrikstadAvst.dbf", 'w')
for filename in glob.glob(os.path.join(folder_path, '*.dbf')):
    fd = open(filename, 'r')
    shutil.copyfileobj(fd, wfd, 1024*1024*10)
    fd.close()
wfd.close()
By opening the output file before the loop and closing it only after iterating over every dbf file, it shouldn't overwrite anything. I removed the List variable (a name that is easily confused with the built-in list, so it is best avoided) because I can't see what it was being used for here.
It almost works now, but the header is written for every file. I only want the header written the first time. How do I skip the header for each subsequent file?
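One way to handle that, as a sketch assuming standard dBASE files where the 16-bit little-endian value at byte offset 8 gives the header length: copy the first file whole, and for every later file seek past its header before copying the records. (The record count stored in the first header and the end-of-file markers would still be off, so for strictly valid output a dedicated dbf library may be safer.)

import glob
import os
import struct

folder_path = r'C:\Tom\Oppdrag_2019\Pendle\2018'
output_path = r'C:\Tom\Oppdrag_2019\Pendle\FredrikstadAvst.dbf'

with open(output_path, 'wb') as wfd:
    for i, filename in enumerate(sorted(glob.glob(os.path.join(folder_path, '*.dbf')))):
        with open(filename, 'rb') as fd:
            if i == 0:
                wfd.write(fd.read())                           # keep the first file's header
            else:
                prefix = fd.read(32)                           # fixed-size header prefix
                header_len = struct.unpack('<H', prefix[8:10])[0]
                fd.seek(header_len)                            # skip this file's header
                wfd.write(fd.read())                           # append only the records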

Copying all files of a directory to one text file in python

My intention is to copy the text of all my C# (and later .aspx) files into one final text file, but it doesn't work.
For some reason, the "yo.txt" file is not created.
I know that the iteration over the files works, but I can't write the data into the .txt file.
The variable 'data' eventually does contain all the text from the files.
Could it be connected to the fact that there are some non-ASCII characters in the text of the C# files?
Here is my code:
import os
import sys

src_path = sys.argv[1]
os.chdir(src_path)
data = ""
for file in os.listdir('.'):
    if os.path.isfile(file):
        if file.split('.')[-1] == "cs" and (len(file.split('.')) == 2 or len(file.split('.')) == 3):
            print "Copying", file
            with open(file, "r") as f:
                data += f.read()
print data
with open("yo.txt", "w") as f:
    f.write(data)
If someone has an idea, it will be great :)
Thanks
You have to ensure that the directory where the file is created has sufficient write permissions; if not, run
chmod -R 777 .
to make the directory writable.
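Before loosening permissions that far, a quick check from Python can confirm whether the directory is actually writable (a small sketch; the path is just an example):

import os

target_dir = '.'  # directory where yo.txt is supposed to be created
if not os.access(target_dir, os.W_OK):
    raise PermissionError("no write access to " + os.path.abspath(target_dir))

If the check passes, the walk-and-concatenate snippet below collects everything into one file.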
import os

inputdir = r'C:\path\to\source'        # source directory to walk (placeholder path)
outputfile = r'C:\path\to\output.txt'  # combined output file (placeholder path)
filelist = []

for r, d, f in os.walk(inputdir):
    for file in f:
        filelist.append(f"{r}\\{file}")

with open(outputfile, 'w') as outfile:
    for f in filelist:
        with open(f) as infile:
            for line in infile:
                outfile.write(line)
        outfile.write('\n \n')
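If the non-ASCII characters mentioned in the question turn out to be the real problem, opening each source file with an explicit encoding avoids decode errors; a sketch assuming the C# sources are UTF-8, with errors='replace' as one possible fallback for stray bytes:

import io

with io.open('Example.cs', 'r', encoding='utf-8', errors='replace') as infile:  # hypothetical file name
    data = infile.read()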

How to sequentially read all the files in a directory and export the contents in Python?

I have a directory /directory/some_directory/ and in that directory I have a set of files. Those files are named in the following format: <letter>-<number>_<date>-<time>_<dataidentifier>.log, for example:
ABC1-123_20162005-171738_somestring.log
DE-456_20162005-171738_somestring.log
ABC1-123_20162005-153416_somestring.log
FG-1098_20162005-171738_somestring.log
ABC1-123_20162005-031738_somestring.log
DE-456_20162005-171738_somestring.log
I would like to read a subset of those files (for example, only files named ABC1-123*.log) and export all their contents to a single CSV file (for example, output.csv), that is, a CSV file that holds all the data from the individual files collectively.
The code that I have written so far:
#!/usr/bin/env python
import os

file_directory = os.getcwd()
m_class = "ABC1"
m_id = "123"
device = m_class + "-" + m_id

for data_file in sorted(os.listdir(file_dir)):
    if str(device)+"*" in os.listdir(file_dir):
        print data_file
I don't know how to read only a subset of filtered files, nor how to export them to a common CSV file.
How can I achieve this?
Just use the re library to match the file name pattern, and the csv library to export.
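A minimal sketch of that idea (the pattern, directory, and output file name here are assumptions for illustration): each matching log becomes one CSV row holding the file name and its contents.

import csv
import os
import re

pattern = re.compile(r'^ABC1-123.*\.log$')        # device prefix from the question
rows = []
for name in sorted(os.listdir('.')):
    if pattern.match(name):
        with open(name) as log_file:
            rows.append([name, log_file.read()])  # one row per log file

with open('output.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(rows)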
Only a few adjustments are needed; you were close:
filesFromDir = os.listdir(os.getcwd())
fileList = [file for file in filesFromDir if file.startswith(device)]
f = open("LogOutput.csv", "ab")
for file in fileList:
    #print "Processing", file
    with open(file, "rb") as log_file:
        txt = log_file.read()
        f.write(txt)
        f.write("\n")
f.close()
Your question could be better stated; based on your current code snippet, I'll assume that you want to:
1. Filter files in a directory based on a glob pattern.
2. Concatenate their contents into a file named output.csv.
In Python you can achieve (1.) by using glob to list filenames:
import glob

for filename in glob.glob('foo*bar'):
    print filename
That would print all files in the current directory whose names start with foo and end with bar.
For (2.) you just read each file and write its content to your desired output, using Python's open() built-in function:
open('filename', 'r')
(Using 'r' as the mode you are asking Python to open the file for "reading"; using 'w' you are asking Python to open the file for "writing".)
The final code would look like the following:
import glob
import sys

device = 'ABC1-123'

with open('output.csv', 'w') as output:
    for filename in glob.glob(device+'*'):
        with open(filename, 'r') as input:
            output.write(input.read())
You can use the os module to list the files.
import os

files = os.listdir(os.getcwd())
m_class = "ABC1"
m_id = "123"
device = m_class + "-" + m_id
file_extension = ".log"

# filter the files by their extension and the starting name
files = [x for x in files if x.startswith(device) and x.endswith(file_extension)]

f = open("output.csv", "a")
for file in files:
    with open(file, "r") as data_file:
        f.write(data_file.read())
    f.write(",\n")
f.close()

Reading/Writing Text from Multiple Files to Master File

In the code below I'm trying to open a series of text files and copy their contents into a single file. I'm getting an error on the "os.write(out_file, line)" in which it asks me for an integer. I haven't defined what "line" is, so is that the problem? Do I need to specify somehow that "line" is a text string from the in_file? Also, I open the out_file through each iteration of the for-loop. Is that bad? Should I open it once at the beginning? Thanks!
import os
import os.path
import shutil

# This is supposed to read through all the text files in a folder and
# copy the text inside to a master file.

# This defines the master file and gets the source directory
# for reading/writing the files in that directory to the master file.
src_dir = r'D:\Term Search'
out_file = r'D:\master.txt'
files = [(path, f) for path, _, file_list in os.walk(src_dir) for f in file_list]

# This for-loop should open each of the files in the source directory, write
# their content to the master file, and finally close the in_file.
for path, f_name in files:
    open(out_file, 'a+')
    in_file = open('%s/%s' % (path, f_name), 'r')
    for line in in_file:
        os.write(out_file, line)
    close(file_name)
close(out_file)
print 'Finished'
You're doing it wrong:
You did:
open(out_file, 'a+')
but that doesn't save the reference as a variable, so you have no way to access the file object you just created. What you need to do:
out_file_handle = open(out_file, 'a+')
...
out_file_handle.write(line)
...
out_file_handle.close()
Or, more pythonically:
out_filename = r"D:\master.txt"
...
with open(out_filename, 'a+') as outfile:
    for filepath in files:
        with open(os.path.join(*filepath)) as infile:
            outfile.write(infile.read())
print "finished"
