I am trying to generate dag files using python code below.
The code below takes two parameters -
bunch of looped json file input
A template that provides the lines to which the variables have to be applied
I can successfully create the output files, but the placeholders copied from the template file are not changed. When a file gets created, I want the JSON values to be substituted into the newly created file dynamically.
json file:
{
"DagId": "dag_file_xyz",
"Schedule": "'#daily'",
"Processed_file_name":"xyz1",
"Source_object_name":"'xyz2,}
Template:
processed_file = xyzOperator(
task_id=processed_file_name,
source_bucket=bucket_path,
destination_bucket=destination_bucket,
source_object=source_object_name,
destination_object=destination_object_name,
delimiter='.csv',
move_object=False
Generate file code
import json
import os
import shutil
import fileinput
import ctypes
# Generate one DAG file per JSON config found in `config_filepath` by copying
# the template and substituting its placeholders in place.
config_filepath = 'C:\\xyz\\'
dag_template_filename = 'C:\\dagfile\\xyztest.py'

for filename in os.listdir(config_filepath):
    print(filename)
    # Close the config file as soon as it has been parsed.
    with open(config_filepath + filename) as f:
        config = json.load(f)

    new_filename = 'dags/' + config['DagId'] + '.py'
    print(new_filename)
    shutil.copyfile(dag_template_filename, new_filename)

    # With inplace=True everything printed inside the loop is written back
    # into new_filename, so each line must be printed exactly once.
    with fileinput.input(new_filename, inplace=True) as template_lines:
        for line in template_lines:
            # str.replace returns a NEW string; the result has to be
            # re-assigned, otherwise the template text is written unchanged.
            # Every key used below must exist in the JSON config, otherwise
            # a KeyError is raised.
            line = line.replace("dag_id", "'" + config['DagId'] + "'")
            line = line.replace("scheduletoreplace", config['Schedule'])
            line = line.replace("processed_file_name", config['Processed_file_name'])
            line = line.replace("source_object_name", config['Source_object_name'])
            line = line.replace("destination_object_name", config['Destination_object_name'])
            print(line, end="")
Related
I want to get all files in a directory (I reached it after doing several for loops - hence fourth.path) that ends with .npy or with csv and then zip those files.
My code runs, but it puts only one file in the zip file. What am I doing wrong?
I tried to change my indents, but no zip file is being created
import json
import os
import zipfile
import zlib
# Question snippet, kept exactly as posted (its indentation was lost).
# The notes below only describe what the individual statements do.
directory = os.path.join(os.getcwd(), 'recs')
radarfolder = 'RadarIfxAvian'
# This listing is never used; the name `file` is rebound by the loop further down.
file = os.listdir(directory)
# Helper that joins two path parts and forces backslashes; it is not called in this snippet.
def r(p, name):
p = os.path.join(p, name)
return p.replace("/", "\\")
# Descend through three directory levels below `directory`, then look for a
# sub-folder whose path contains `radarfolder`.
for first in os.scandir(directory):
if first.is_dir():
for second in os.scandir(first.path):
if second.is_dir():
for third in os.scandir(second.path):
if third.is_dir():
radar_folder_name = ''
# `()` is an empty tuple.
list_files = ()
for fourth in os.scandir(third.path):
if fourth.is_dir():
if radarfolder in fourth.path:
radar_folder_name = fourth.path
print(radar_folder_name)
list_files = ()
for file in os.listdir(fourth.path):
# `|` is the bitwise OR of the two booleans returned by endswith().
if file.endswith(".npy") | file.endswith(".csv"):
# Parentheses without a trailing comma do not build a tuple: this rebinds
# list_files to the single file name, replacing the previous value.
list_files = (file)
print(list_files)
# Mode 'w' creates a fresh archive each time it is opened, truncating an existing radar.zip.
with zipfile.ZipFile(radar_folder_name +'\\' +'radar.zip', 'w', compression=zipfile.ZIP_DEFLATED ) as zipMe:
# Concatenation only works while list_files is a str; with the empty tuple it
# raises the TypeError quoted below the snippet.
zipMe.write(radar_folder_name +'\\' +list_files)
# The `with` statement already closes the archive when its body ends.
zipMe.close()
I tried to change my indents either resulting in error: TypeError: can only concatenate str (not "tuple") to str or no zip file being created
As I said in my second comment, your problem comes from the 'w' argument in your zipping statement. It causes the zip to be overwritten every time it's opened, which you do for each file you zip in. You can fix this 2 ways (at least):
Replace 'w' with 'a'; this way the files will be appended to your zip (with the side effect that, if you do this several times, files will be added more than once).
Keep the 'w', but only open the zip once, having listed all the files you want to zip before. See my code below.
I've taken the liberty of rewriting the part of your code where you look for the 'RadarIfxAvian' folder, since nested for loops are clumsy (and if your folder structure changes, they might not work), replacing it with a multi-purpose recursive function.
Note that the folder structure will be included in the .zip; if you want to zip only the files themselves, consider doing os.chdir(radar_folder_name) before zipping the files.
# This function recursively looks for the 'filename' file or folder
# under 'start_path' and returns the full path, or an empty string if not found.
def find_file(start_path, filename):
    """Search *start_path* recursively for an entry named *filename*.

    The current directory is checked completely before any sub-directory is
    entered, so a match at a shallower level wins over a deeper one in the
    same branch.

    Args:
        start_path: Directory in which the search starts.
        filename: Exact name of the file or folder to look for.

    Returns:
        The path of the first match, built with ``os.path.join``, or ``''``
        when nothing below *start_path* has that name.

    Raises:
        OSError: If *start_path* (or a sub-directory) cannot be listed.
    """
    subdirs = []
    # A single scan serves both the name check and the collection of
    # sub-directories; the context manager releases the directory handle.
    with os.scandir(start_path) as entries:
        for entry in entries:
            if entry.name == filename:
                return os.path.join(start_path, filename)
            if entry.is_dir():
                subdirs.append(entry.path)
    for subdir in subdirs:
        deep_path = find_file(subdir, filename)
        if deep_path:
            return deep_path
    return ''
# Locate the radar folder below ./recs and zip its .npy/.csv files into
# radar.zip inside that folder.
directory = os.path.join(os.getcwd(), 'recs')
radarfolder = 'RadarIfxAvian'
radar_folder_name = find_file(directory, radarfolder)
print(radar_folder_name)
list_files = []
# find_file returns '' when the folder does not exist; listing '' would raise,
# so only zip when something was actually found.
if radar_folder_name:
    list_files = [
        file for file in os.listdir(radar_folder_name)
        if file.endswith((".npy", ".csv"))
    ]
    # The archive is opened once, so mode 'w' no longer discards earlier files.
    with zipfile.ZipFile(os.path.join(radar_folder_name, 'radar.zip'), 'w', compression=zipfile.ZIP_DEFLATED) as zipMe:
        for file in list_files:
            zipMe.write(os.path.join(radar_folder_name, file))
If I understand your code correctly, you are looking for a folder "RadarIfxAvian" and want to place a .ZIP in that folder containing any .CSV or .NPY files in that directory. This should do the equivalent, using os.walk for the recursive search:
import os
import zipfile
# Walk ./recs until the 'RadarIfxAvian' folder is found, then zip its
# .npy/.csv files into radar.zip inside that folder.
for path, dirs, files in os.walk('recs'):
    if os.path.basename(path) == 'RadarIfxAvian':
        print(path)
        with zipfile.ZipFile(os.path.join(path, 'radar.zip'), 'w', zipfile.ZIP_DEFLATED) as archive:
            for file in files:
                if file.endswith((".npy", ".csv")):
                    print(file)
                    # os.walk yields bare file names, so the source must be
                    # joined with `path`; arcname keeps the bare name inside
                    # the archive.
                    archive.write(os.path.join(path, file), arcname=file)
        break  # stop search once the directory is found and processed
I adjusted my code with the following steps:
Put the if in a function
Writing the zip by looping over each item in the list I appended to
import json
import os
import glob
import zipfile
import zlib
# Root folder that is scanned: ./recs relative to the current working directory.
directory = os.path.join(os.getcwd(), 'recs')
# Folder-name fragment that marks the folder whose files get zipped.
radarfolder = 'RadarIfxAvian'
# This listing is not used afterwards; `file` is rebound by the loop below.
file = os.listdir(directory)
# Shared accumulator that tozip() appends to and the scan loop clears.
list_files = []
def r(p, name):
    """Join *name* onto *p* and return the result with every "/" turned into a backslash."""
    joined = os.path.join(p, name)
    return "\\".join(joined.split("/"))
def tozip(path, file):
    """Queue *file* for zipping when it is a .npy or .csv file.

    Args:
        path: Folder that contains *file*.
        file: Bare file name to test.

    Returns:
        The module-level ``list_files`` list (the same object on every call),
        extended with ``path + '\\\\' + file`` when the extension matched.
    """
    # endswith accepts a tuple, so one call covers both extensions.
    if file.endswith((".npy", ".csv")):
        list_files.append(path + '\\' + file)
    return list_files
# Descend three directory levels below `directory`; in every fourth-level
# folder whose path contains `radarfolder`, zip the .npy/.csv files into
# radar.zip inside that folder.
for first in os.scandir(directory):
    if not first.is_dir():
        continue
    for second in os.scandir(first.path):
        if not second.is_dir():
            continue
        for third in os.scandir(second.path):
            if not third.is_dir():
                continue
            radar_folder_name = ''
            filestozip = []
            # list_files is shared with tozip(); reset it for each third-level folder.
            list_files.clear()
            for fourth in os.scandir(third.path):
                if not (fourth.is_dir() and radarfolder in fourth.path):
                    continue
                radar_folder_name = fourth.path
                for file in os.listdir(fourth.path):
                    filestozip = tozip(radar_folder_name, file)
                print(filestozip)
                # The context manager closes the archive; a ZipFile that is
                # never closed may be left without its central directory.
                with zipfile.ZipFile(r(radar_folder_name, "radar.zip"), "w") as archive:
                    for a in filestozip:
                        archive.write(a, compress_type=zipfile.ZIP_DEFLATED)
                print(radar_folder_name + "added to zip")
I created this code to get all Excel files in a folder and write a CSV file for every sheet in every file. The script works fine, but sometimes the last Excel file converted is still locked by Python on the file system. Can anyone help me understand what's happening?
import sys
from os import listdir
from os.path import isfile, join
import pandas as pd
import csv
import re
def removeEspecialCharacters(obj):
    """Strip the control characters U+0090 and U+008F and all CR/LF from a string.

    Args:
        obj: Any cell value; only ``str`` values are modified.

    Returns:
        The cleaned string, or *obj* unchanged when it is not a ``str``.
    """
    if not isinstance(obj, str):
        return obj
    # Inside [...] the characters "(", "|" and ")" are literals, so the former
    # pattern '[(\x90|\x8F)]' also deleted parentheses and pipes from the
    # data; only the two control characters belong in the class.
    cleaned = re.sub('[\x90\x8F]', '', obj)
    return cleaned.replace("\r", "").replace("\n", "")
# Question snippet, kept exactly as posted (its indentation was lost).
# Converts every worksheet of every Excel workbook in myFolder to its own CSV file.
myFolder = r'C:\Users\myuser\Downloads\ConvertFilesToCsv'
# Regular files only; sub-directories are skipped.
myFiles = [f for f in listdir(myFolder) if isfile(join(myFolder, f))]
for x in range(len(myFiles)):
# The extension test is case-insensitive: the name is lower-cased before endswith().
if (myFiles[x].lower().endswith('.xls') or myFiles[x].lower().endswith('.xlsx') or myFiles[x].lower().endswith('.xlsb')):
print('Converting file: '+myFiles[x]);
# .xlsb workbooks are opened with the pyxlsb engine; the others use the pandas default.
if (myFiles[x].lower().endswith('.xlsb')):
file = pd.ExcelFile(myFolder+'\\'+myFiles[x], engine='pyxlsb')
else:
file = pd.ExcelFile(myFolder+'\\'+myFiles[x])
for mySheetName in file.sheet_names:
df = pd.read_excel(file, sheet_name=mySheetName)
df = df.applymap(removeEspecialCharacters)
# Output name: workbook name with '.xlsx'/'.xlsb'/'.xls' removed (case-sensitive,
# anywhere in the name), then '_<sheet name>.csv'.
csvFileName = myFolder+'\\'+myFiles[x].replace('.xlsx','').replace('.xlsb','').replace('.xls','')+'_'+mySheetName+'.csv'
df.to_csv(csvFileName,encoding='utf-8-sig',index=False,sep=",",quoting=csv.QUOTE_NONNUMERIC,quotechar="\"",escapechar="\"",decimal=".",date_format='%Y-%m-%d')#,quotechar='\'', escapechar='\\')
# NOTE(review): close() is only reached when nothing above raised; opening the
# workbook with `with pd.ExcelFile(...) as file:` would release the handle on every path.
file.close()
file = ''
Note: this is really a comment; it is posted here as an answer only for the code formatting.
Your code looks fine to me. I would advise you to use context management, similar to the doc, like this:
# Open each workbook in a `with` block so the file handle is released even
# when the conversion raises.
for filename in myFiles:
    # Lower-case the extension so 'REPORT.XLSX' is accepted too, matching the
    # case-insensitive check in the question.
    extension = filename.rsplit('.', 1)[-1].lower()
    # skip anything that is not an Excel workbook
    if extension not in ('xls', 'xlsx', 'xlsb'):
        continue
    # only .xlsb needs a non-default engine
    kwargs = {'engine': 'pyxlsb'} if extension == 'xlsb' else {}
    with pd.ExcelFile(myFolder + '\\' + filename, **kwargs) as file:
        # do other stuff with file
        ...
    # you don't need to close file here
    # file.close()
I'm attempting to rename a file and am successful in doing so until the last line below. Using os.rename, no matter what I've tried (f strings for example) I just can't get it to work. I'm getting quadruple and double slashes in the output. I'm sure it's a simple resolution. Any help would be greatly appreciated.
import os
from os import path
import shutil
from datetime import date
# Question snippet, kept exactly as posted: the RESULT and traceback quoted
# below it were produced by these statements.
WDPath = r'\\xxx\yyy\gis\SAP_IMPORT_SQLARCGIS'
# ARCHIVE_PATH is defined but not used in this snippet.
ARCHIVE_PATH = r'\\xxx\yyy\gis\SAP_IMPORT_SQLARCGIS\ARCHIVE\GISDEVICES'
# The file name carries its own leading backslash.
IMPORT_FILE_NAME = r'\GISDEVICES.txt'
# os.path.join receives a single, already concatenated string here.
IMPORT_FILE = os.path.join(f'{WDPath}{IMPORT_FILE_NAME}')
print(IMPORT_FILE, '\n')
# split the file into filename and extension
# (the split is done on IMPORT_FILE_NAME, so `filename` keeps the leading
# backslash but has no directory part)
filename, extension = os.path.splitext(IMPORT_FILE_NAME)
print(filename, extension, '\n')
# Get the create time of the file
create_time = os.path.getctime(IMPORT_FILE)
print(create_time, '\n')
# get the readable timestamp format
## format_time = datetime.datetime.fromtimestamp(create_time)
format_time = date.fromtimestamp(create_time)
print(format_time, '\n')
# convert time into string
format_time_string = format_time.strftime("-%Y-%m-%d")
print(format_time_string, '\n')
# Construct the new name of the file
newfile = filename + format_time_string + extension
print(newfile, '\n')
# rename the file
# NOTE: `newfile` is '\GISDEVICES-<date>.txt' without the WDPath directory, so
# the target is not inside the source folder; this is the call that fails in
# the traceback shown below.
os.rename(IMPORT_FILE, newfile)
print(IMPORT_FILE)
RESULT >>
\\nasgriver\SAP_Share\gis\SAP_IMPORT_SQLARCGIS\GISDEVICES.txt
\GISDEVICES .txt
1614332702.039849
2021-02-26
-2021-02-26
\GISDEVICES-2021-02-26.txt
Traceback (most recent call last):
File "\\xxx\yyy\gis\SAP_IMPORT_SQLARCGIS\MOVERENAMEFILE.py", line 42, in <module>
os.rename(IMPORT_FILE, newfile)
OSError: [WinError 17] The system cannot move the file to a different disk drive: '\\\\xxx\\yyy\\gis\\SAP_IMPORT_SQLARCGIS\\GISDEVICES.txt' -> '\\GISDEVICES-2021-02-26.txt'
Using the path submodule of os can come with some annoyance, you can try with pathlib instead:
import os
from pathlib import Path
import shutil
from datetime import date
# Rename GISDEVICES.txt inside WDPath to GISDEVICES-<YYYY-MM-DD>.txt, where
# the date comes from the file's ctime.
WDPath = Path(r"\\xxx\yyy\gis\SAP_IMPORT_SQLARCGIS")
ARCHIVE_PATH = Path(r"\\xxx\yyy\gis\SAP_IMPORT_SQLARCGIS\ARCHIVE\GISDEVICES")
IMPORT_FILE_NAME = "GISDEVICES.txt"
IMPORT_FILE = WDPath / IMPORT_FILE_NAME
print(IMPORT_FILE, '\n')
# split the file into filename and extension
filename, extension = os.path.splitext(IMPORT_FILE_NAME)
print(filename, extension, '\n')
# Get the create time of the file
create_time = os.path.getctime(IMPORT_FILE)
print(create_time, '\n')
# get the readable timestamp format
## format_time = datetime.datetime.fromtimestamp(create_time)
format_time = date.fromtimestamp(create_time)
print(format_time, '\n')
# convert time into string
format_time_string = format_time.strftime("-%Y-%m-%d")
print(format_time_string, '\n')
# Construct the new name of the file
newfile = filename + format_time_string + extension
print(newfile, '\n')
# rename the file
# with_name() swaps only the last path component, so the target stays in
# WDPath; `IMPORT_FILE / newfile` would point *below* the file itself.
IMPORT_FILE.rename(IMPORT_FILE.with_name(newfile))
print(IMPORT_FILE)
Let's say I have n files in a directory with filenames: file_1.txt, file_2.txt, file_3.txt, ..., file_n.txt. I would like to import them into Python individually, do some computation on them, and then store the results in n corresponding output files: file_1_o.txt, file_2_o.txt, ..., file_n_o.txt.
I've figured out how to import multiple files:
import glob
import numpy as np
path = r'home\...\CurrentDirectory'
allFiles = glob.glob(path + '/*.txt')
for file in allFiles:
# do something to file
...
...
np.savetxt(file, ) ???
Not quite sure how to append the _o.txt (or any string for that matter) after the filename so that the output file is file_1_o.txt
Can you use the following snippet to build the output filename?
# Split on the LAST dot only, so names or paths that contain additional dots
# still yield exactly [stem, extension].
parts = in_filename.rsplit(".", 1)
out_filename = parts[0] + "_o." + parts[1]
where I assumed in_filename is of the form "file_1.txt".
Of course, it would probably be better to put "_o." (the suffix before the extension) in a variable, so that you can change it in just one place.
In your case it means
import glob
import numpy as np
path = r'home\...\CurrentDirectory'
allFiles = glob.glob(path + '/*.txt')
for file in allFiles:
# do something to file
...
# glob returns the whole path, and the directory part may itself contain dots
# (as in 'home\...\CurrentDirectory'); split on the LAST dot only.
parts = file.rsplit(".", 1)
out_filename = parts[0] + "_o." + parts[1]
np.savetxt(out_filename, ) ???
but you need to be careful, since maybe before you pass out_filename to np.savetxt you need to build the full path so you might need to have something like
np.savetxt(os.path.join(path, out_filename), )
or something along those lines.
If you would like to combine the change in basically one line and define your "suffix in a variable" as I mentioned before you could have something like
hh = "_o." # variable suffix
..........
# inside your loop now
for file in allFiles:
    # rsplit(".", 1) yields [stem, extension]; joining with hh puts the suffix
    # before the extension only, even when the path contains other dots.
    out_filename = hh.join(file.rsplit(".", 1))
which is another way of doing the same thing, by using join on the split list, as mentioned by @NathanAck in his answer.
import os
# For every file in filePath: read it, modify the text, and write the result
# next to it under the same name with "_o" inserted before the extension.
#put the path to the files here
filePath = "C:/stack/codes/"
theFiles = os.listdir(filePath)
for file in theFiles:
    #add path name before the file
    file = filePath + str(file)
    # `with` closes the file even if reading fails
    with open(file, 'r') as fileToRead:
        fileData = fileToRead.read()
    #DO WORK ON SPECIFIC FILE HERE
    #access the file through the fileData variable
    fileData = fileData + "\nAdd text or do some other operations"
    #change the file name to add _o
    # split on the last dot only, so "a.b.txt" becomes "a.b_o.txt"
    # instead of "a_o.b_o.txt"
    fileVar = file.rsplit(".", 1)
    newFileName = "_o.".join(fileVar)
    #write the file with _o added from the modified data in fileData
    with open(newFileName, 'w') as fileToWrite:
        fileToWrite.write(fileData)
I am trying to use the elements of a list as the names of some files that live in a directory. So far I have created a function that retrieves the name of each file in a directory and returns them in a list:
def retrive(directory_path):
    """Collect the base names of all ``*.pdf`` files in *directory_path*.

    Args:
        directory_path: Directory to search (not recursive).

    Returns:
        The file names (without directory), sorted by full path. The list is
        also printed, as before.
    """
    # os.path.basename handles whichever separator the platform uses,
    # unlike splitting on '/'.
    path_names = [
        os.path.basename(filename)
        for filename in sorted(glob.glob(os.path.join(directory_path, '*.pdf')))
    ]
    print(path_names)
    # The list was previously only printed, so callers received None.
    return path_names
The above function returns in a list the names of each file, then I am writing the files into another directory as follows:
path = os.path.join(new_dir_path, "list%d.txt" % i)
#This is the path of each new file:
#print(path)
with codecs.open(path, "w", encoding='utf8') as filename:
for item in [a_list]:
filename.write(item+"\n")
Finally, my question is: how can I assign as a name of each file, each element of path_names?, something like this line:
path = os.path.join(new_dir_path, "list%d.txt" % i)
I also tried to use the format() function. However, I still can't assign the correct name to each file.
Here's the full script:
def transform_directoy(input_directory, output_directory):
    """Extract the text of every PDF in *input_directory* with tika and write
    each text to ``list<i>.txt`` in *output_directory*, where ``i`` is the
    position of the PDF in the sorted file list."""
    import codecs, glob, os
    from tika import parser

    pdf_paths = sorted(glob.glob(os.path.join(input_directory, '*.pdf')))
    all_texts = [parser.from_file(pdf_path)['content'] for pdf_path in pdf_paths]

    for i, a_list in enumerate(all_texts):
        path = os.path.join(output_directory, "list%d.txt" % i)
        with codecs.open(path, "w", encoding='utf8') as out_file:
            out_file.write(a_list + "\n")
The desired output will consist of the actual names of each processed file.
You’re almost there:
for path_name in path_names:
path = os.path.join(new_dir_path, "list%s.txt" % path_name)
#This is the path of each new file:
#print(path)
with codecs.open(path, "w", encoding='utf8') as f:
for item in [a_list]:
f.write(item+"\n")
Update based on updated code sample. You are using different loops here, and that is not ideal unless you are doing processing in between the two loops. Since I am going to keep that structure, we are going to have to make sure to associate each block of content with the original filename. The best structure for that is a dict, and in case order is important, we use an OrderedDict. Now, when we’re looping over the filename, content pairs in the OrderedDict we’ll want to change the extension of the file to match the new file type. Luckily, python has some nice utilities for file/path manipulation in the os.path module. os.path.basename can be used to strip off the directory from a file and os.path.splitext will strip off an extension from a filename. We use both of those to get just the filename without the extension and then append .txt to designate the new file type. Putting it all together, we get :
def transform_directoy(input_directory, output_directory):
    """Convert every PDF in *input_directory* to a text file of the same name.

    The PDFs are parsed first and their text kept in an ordered mapping keyed
    by the original file name; a second pass writes ``<name>.txt`` into
    *output_directory*.

    Args:
        input_directory: Directory containing the ``*.pdf`` files.
        output_directory: Existing directory that receives the ``.txt`` files.
    """
    import codecs, glob, os
    from collections import OrderedDict
    from tika import parser

    all_texts = OrderedDict()
    for pdf_path in sorted(glob.glob(os.path.join(input_directory, '*.pdf'))):
        parsed = parser.from_file(pdf_path)
        all_texts[os.path.basename(pdf_path)] = parsed['content']

    for original_filename, text in all_texts.items():
        new_filename, _ = os.path.splitext(original_filename)
        new_filename += '.txt'
        path = os.path.join(output_directory, new_filename)
        # Print out the name of the file we are processing
        print('Transforming %s => %s' % (original_filename, path,))
        with codecs.open(path, "w", encoding='utf8') as out_file:
            # NOTE(review): tika reportedly returns None as content when a PDF
            # has no extractable text - confirm; an empty file is written then
            # instead of failing on None + "\n".
            out_file.write((text or '') + "\n")
Second update: OP asked how I would write this code if this was all that there was, so here goes:
# move imports to top of file: PEP 8
import codecs, glob, os
from tika import parser
def transform_directoy(input_directory, output_directory):
    """Convert every PDF in *input_directory* to a text file of the same name.

    Each PDF is parsed with tika and its text written to ``<name>.txt`` in
    *output_directory*, followed by a newline.

    Args:
        input_directory: Directory containing the ``*.pdf`` files.
        output_directory: Existing directory that receives the ``.txt`` files.
    """
    for pdf_path in sorted(glob.glob(os.path.join(input_directory, '*.pdf'))):
        parsed = parser.from_file(pdf_path)
        # NOTE(review): tika reportedly returns None as content when a PDF has
        # no extractable text - confirm; fall back to an empty string so the
        # write below cannot fail on None.
        parsed_content = parsed['content'] or ''
        original_filename = os.path.basename(pdf_path)
        new_filename, _ = os.path.splitext(original_filename)
        new_filename += '.txt'
        path = os.path.join(output_directory, new_filename)
        # Print out the name of the file we are processing
        print('Transforming %s => %s' % (original_filename, path,))
        # no need for a second loop since we can piggy back off the first loop
        with codecs.open(path, "w", encoding='utf8') as out_file:
            out_file.write(parsed_content)
            out_file.write("\n")