I created this code to get all Excel files in a folder and create a CSV file for every sheet in every file. The script works fine, but sometimes the last Excel file converted is still locked by Python on the file system. Can anyone help me understand what's happening?
import sys
from os import listdir
from os.path import isfile, join
import pandas as pd
import csv
import re

def removeEspecialCharacters(obj):
    if isinstance(obj, str):
        retorno = re.sub('[(\x90|\x8F)]', '', obj).replace("\r", "").replace("\n", "")
    else:
        retorno = obj
    return retorno

myFolder = r'C:\Users\myuser\Downloads\ConvertFilesToCsv'
myFiles = [f for f in listdir(myFolder) if isfile(join(myFolder, f))]

for x in range(len(myFiles)):
    if (myFiles[x].lower().endswith('.xls') or myFiles[x].lower().endswith('.xlsx') or myFiles[x].lower().endswith('.xlsb')):
        print('Converting file: ' + myFiles[x])
        if (myFiles[x].lower().endswith('.xlsb')):
            file = pd.ExcelFile(myFolder + '\\' + myFiles[x], engine='pyxlsb')
        else:
            file = pd.ExcelFile(myFolder + '\\' + myFiles[x])
        for mySheetName in file.sheet_names:
            df = pd.read_excel(file, sheet_name=mySheetName)
            df = df.applymap(removeEspecialCharacters)
            csvFileName = myFolder + '\\' + myFiles[x].replace('.xlsx', '').replace('.xlsb', '').replace('.xls', '') + '_' + mySheetName + '.csv'
            df.to_csv(csvFileName, encoding='utf-8-sig', index=False, sep=",", quoting=csv.QUOTE_NONNUMERIC, quotechar="\"", escapechar="\"", decimal=".", date_format='%Y-%m-%d')  #, quotechar='\'', escapechar='\\')
        file.close()
        file = ''
Your code looks fine to me. I would advise you to use context management, as in the docs, like this:
for filename in myFiles:
    extension = filename.split('.')[-1]
    if extension not in ['xls', 'xlsx', 'xlsb']:
        continue
    kwargs = {'engine': 'pyxlsb'} if extension == 'xlsb' else {}
    with pd.ExcelFile(myFolder + '\\' + filename, **kwargs) as file:
        # do other stuff with file
        ...
    # you don't need to close file here
    # file.close()
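For completeness, here is a minimal sketch of the full conversion loop from the question rewritten around the context manager (it reuses removeEspecialCharacters and the folder name from the question, and assumes pyxlsb is installed for .xlsb files):

import csv
from os import listdir
from os.path import isfile, join
import pandas as pd

myFolder = r'C:\Users\myuser\Downloads\ConvertFilesToCsv'
myFiles = [f for f in listdir(myFolder) if isfile(join(myFolder, f))]

for filename in myFiles:
    extension = filename.split('.')[-1].lower()
    if extension not in ['xls', 'xlsx', 'xlsb']:
        continue
    print('Converting file: ' + filename)
    kwargs = {'engine': 'pyxlsb'} if extension == 'xlsb' else {}
    # the with-block guarantees the OS handle is released,
    # even if reading one of the sheets raises an exception
    with pd.ExcelFile(join(myFolder, filename), **kwargs) as file:
        for mySheetName in file.sheet_names:
            df = pd.read_excel(file, sheet_name=mySheetName)
            df = df.applymap(removeEspecialCharacters)
            csvFileName = join(myFolder, filename.rsplit('.', 1)[0] + '_' + mySheetName + '.csv')
            df.to_csv(csvFileName, encoding='utf-8-sig', index=False,
                      quoting=csv.QUOTE_NONNUMERIC, date_format='%Y-%m-%d')

This also explains the original symptom: if an exception interrupts the loop between pd.ExcelFile(...) and file.close(), the handle to the last file is never released; the with-block closes it unconditionally.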
I have a Python script that reads a CSV file and outputs it to HTML.
I would like to change the order of the columns once written to the HTML file, but I don't know whether I should do this on read or on write, or which function to use. Here is my code so far.
import pandas as pd
import os
import shutil
import glob

# paths
HTMLPATH = "C:/NMS4/QUE/HTML/"
QUEPATH = "C:/NMS4/QUE/"

# create the directory for holding the HTML files if it doesn't exist
isExist = os.path.exists(HTMLPATH)
if not isExist:
    os.mkdir(HTMLPATH, 0o777)

# convert .txt file to an HTML file in the HTML folder
# can't convert an empty file, so only convert if the file size is not 0
for quefile in glob.iglob('C:/NMS4/QUE/*.txt'):
    if os.path.getsize(quefile) != 0:
        csv = pd.read_csv(quefile, header=None, usecols=[0,3,4,15,34,43,44,129], names=['OrderNo','Req Qty','Planned Start','Resource','Op','Part','Desc','Qty Recd'])
        html_table = csv.to_html()
        f = open(quefile + '.html', 'w')
        f.write(html_table)
        f.close()
        shutil.move(quefile + ".HTML", HTMLPATH)
Any help, greatly appreciated.
Thanks.
I've been looking at https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_html.html, but can't quite find out how to re-order the columns.
Edit: the changes to get the solution are:
import pandas as pd
import os
import shutil
import glob

# paths
HTMLPATH = "C:/NMS4/QUE/HTML/"
QUEPATH = "C:/NMS4/QUE/"

# create the directory for holding the HTML files if it doesn't exist
isExist = os.path.exists(HTMLPATH)
if not isExist:
    os.mkdir(HTMLPATH, 0o777)

# convert .txt file to an HTML file in the HTML folder
# Python can't convert an empty file, so only convert if the file size is not 0
for quefile in glob.iglob('C:/NMS4/QUE/*.txt'):
    if os.path.getsize(quefile) != 0:
        csv = pd.read_csv(quefile, header=None, usecols=[0,3,4,15,34,43,44,129], names=['Order No','Req Qty','Planned Start','Resource','Op','Part','Desc','Qty Recd'])
        cols = list(csv.columns)
        a, b, c, d, e, f, g = cols.index('Order No'), cols.index('Req Qty'), cols.index('Planned Start'), cols.index('Resource'), cols.index('Op'), cols.index('Part'), cols.index('Desc')
        cols[a], cols[b], cols[c], cols[d], cols[e], cols[f], cols[g] = cols[a], cols[e], cols[f], cols[g], cols[c], cols[b], cols[d]
        df = csv[cols]
        html_table = df.to_html()
        f = open(quefile + '.html', 'w')
        f.write(html_table)
        f.close()
        shutil.move(quefile + ".HTML", HTMLPATH)
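Since the target order is fixed and known up front, the index-swapping block can be replaced by simply selecting the columns in the desired order. The swap above produces the order Order No, Op, Part, Desc, Planned Start, Req Qty, Resource, Qty Recd; given that, a shorter equivalent (a sketch using the same column names, either reindexing the frame or using the columns= parameter that to_html accepts):

order = ['Order No', 'Op', 'Part', 'Desc', 'Planned Start', 'Req Qty', 'Resource', 'Qty Recd']
html_table = csv[order].to_html()        # reorder by selecting columns
# or, equivalently, reorder at write time:
html_table = csv.to_html(columns=order)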
I have to convert JSON files, as I said. Here is the code:
def AnalysisJson():
    file_path = 'my_file'
    for root, dirs, files in os.walk(file_path):
        for file in files:
            InputPath = open(file_path + '\\' + file, encoding="utf-8")
            for i in files:
                df = json.load(InputPath)
                demo = pd.json_normalize(df, record_path='label_annotations')
                demo.to_csv('files.csv')
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
I want to convert these files; if the code is too hard to get running, I hope someone can give me some advice. Thanks!
I am not sure that I understand correctly what you want, but here is an answer based on my interpretation of your question.
import json
import os
from glob import glob

import pandas as pd

def json_to_csv(dir_path: str) -> None:
    for file_path in glob(os.path.join(dir_path, '*.json')):
        with open(file_path, encoding='utf-8') as f:
            data = json.load(f)
        df = pd.json_normalize(data, record_path='label_annotations')
        df.to_csv(file_path.replace('.json', '.csv'), index=False)
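Usage, assuming the directory name my_file from the question:

json_to_csv('my_file')  # writes one .csv next to each .json in the folder

As an aside, the JSONDecodeError in the question most likely comes from the inner for i in files loop: it calls json.load repeatedly on the same already-consumed file handle, so the second call starts reading at end-of-file and finds no value, which is exactly "Expecting value: line 1 column 1 (char 0)".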
I am having some issues passing an argument to a Python script so that it handles a specific file type such as CSV, TXT, or XML.
I am reviewing Python and would like some feedback on why I don't see any output after running the following command: ./my_script some3455.csv
#!/usr/bin/python
import sys
import csv
import xml.etree.ElementTree as ET

FILE = str(sys.argv[1])

def run_files():
    if FILE == '*.csv':
        run_csv()
    elif FILE == '*.txt':
        run_txt()
    else:
        run_xml()

def run_csv():
    csv_file = csv.register_dialect('dialect', delimiter='|')
    with open(FILE, 'r') as file:
        reader = csv.reader(file, dialect='dialect')
        for row in reader:
            print(row)

def run_txt():
    with open(FILE, 'r') as file:
        txt_contents = file.read()
        print(txt_contents)

def run_xml():
    tree = ET.parse(FILE)
    root = tree.getroot()
    for child in root.findall('Attributes'):
        car = child.find('Car').text
        color = child.find('Color').text
        print(car, color)
I have tried passing it without the FILE variable, but it only works for one file type; the other file types don't get identified.
You need to use fnmatch and not == to compare a string with a glob pattern:
import fnmatch

def run_files():
    if fnmatch.fnmatch(FILE, '*.csv'):
        run_csv()
    elif fnmatch.fnmatch(FILE, '*.txt'):
        run_txt()
    else:
        run_xml()
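One more observation, in addition to the comparison fix: the posted script never actually calls run_files(), which by itself explains the missing output. Adding an entry point at the bottom of the script fixes that:

if __name__ == '__main__':
    run_files()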
I have a main folder with many .tar.gz compressed files, so I need to unzip twice to get to a data file with text; then I extract a certain string from the text. I am having trouble unzipping to get to the file with text, then moving to the next file and doing the same, saving the results in a DataFrame.
import os
import tarfile

for i in os.listdir(r'\user\project gz'):
    tar = (i, "r:gz")
    for m in tar.getmembers():
        f = tar.extractfile(member):
        if f is not None:
            content = f.read()
            text = re.findall(r"\name\s", content)
            df = pd.Dataframe(text)
            print(df)
I guess you want to find the files that contain the string \name\s in \user\project gz\*.tar.gz?
A solution is:
import os
import re
import tarfile

import pandas as pd

row = []
value = []
for filename in os.listdir(r'\\user\\project gz'):
    if filename.endswith('.tar.gz'):
        # join the directory and the filename; plain concatenation
        # would drop the path separator between them
        tar = tarfile.open(os.path.join(r'\\user\\project gz', filename))
        for text_file in tar.getmembers():
            f = tar.extractfile(text_file)
            if f is not None:
                content = f.read().decode()
                if re.findall(r"\\name\\s", content):
                    row.append(text_file.name)
                    value.append(content)
        tar.close()
df = pd.DataFrame(value, columns=['nametag'], index=row)
print(df)
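If "unzip twice" means each .tar.gz contains another archive inside (my reading of the question), the inner member can be opened as a second tarfile without writing it to disk, via the fileobj= argument. A sketch of just that inner step, reusing the names above and assuming the nested archive is itself a .tar.gz:

inner = tar.extractfile(text_file)  # the member that is itself an archive
if inner is not None and text_file.name.endswith('.tar.gz'):
    with tarfile.open(fileobj=inner, mode='r:gz') as inner_tar:
        for data_file in inner_tar.getmembers():
            g = inner_tar.extractfile(data_file)
            if g is not None:
                content = g.read().decode()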
Let's say I have n files in a directory with filenames file_1.txt, file_2.txt, file_3.txt, ..., file_n.txt. I would like to import them into Python individually, do some computation on them, and then store the results in n corresponding output files: file_1_o.txt, file_2_o.txt, ..., file_n_o.txt.
I've figured out how to import multiple files:
import glob
import numpy as np

path = r'home\...\CurrentDirectory'
allFiles = glob.glob(path + '/*.txt')
for file in allFiles:
    # do something to file
    ...
    ...
    np.savetxt(file, ) ???
I'm not quite sure how to append the _o (or any string, for that matter) to the filename so that the output file is file_1_o.txt.
Can you use the following snippet to build the output filename?
parts = in_filename.split(".")
out_filename = parts[0] + "_o." + parts[1]
where I assumed in_filename is of the form "file_1.txt".
Of course, it would probably be better to put "_o." (the suffix before the extension) in a variable, so that you can change it in just one place and modify the suffix more easily.
In your case it means
import glob
import numpy as np

path = r'home\...\CurrentDirectory'
allFiles = glob.glob(path + '/*.txt')
for file in allFiles:
    # do something to file
    ...
    parts = file.split(".")
    out_filename = parts[0] + "_o." + parts[1]
    np.savetxt(out_filename, ) ???
But you need to be careful: before you pass out_filename to np.savetxt you may need to build the full path, so you might need something like
np.savetxt(os.path.join(path, out_filename), )
or something along those lines.
If you would like to combine the change into basically one line, and define your "suffix in a variable" as mentioned before, you could have something like
hh = "_o."  # variable suffix
..........
# inside your loop now
for file in allFiles:
    out_filename = hh.join(file.split("."))
which does the same thing another way, using join on the split list, as mentioned by @NathanAck in his answer.
import os

# put the path to the files here
filePath = "C:/stack/codes/"
theFiles = os.listdir(filePath)

for file in theFiles:
    # add path name before the file
    file = filePath + str(file)
    fileToRead = open(file, 'r')
    fileData = fileToRead.read()

    # DO WORK ON SPECIFIC FILE HERE
    # access the file through the fileData variable
    fileData = fileData + "\nAdd text or do some other operations"

    # change the file name to add _o
    fileVar = file.split(".")
    newFileName = "_o.".join(fileVar)

    # write the file with _o added, using the modified data in fileData
    fileToWrite = open(newFileName, 'w')
    fileToWrite.write(fileData)

    # close open files
    fileToWrite.close()
    fileToRead.close()
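As an aside (not from either answer), os.path.splitext is a more robust way to split off the extension than str.split("."), since it only splits at the last dot and therefore survives dots elsewhere in the path:

import os

root, ext = os.path.splitext("C:/stack/codes/file_1.txt")
out_filename = root + "_o" + ext  # -> "C:/stack/codes/file_1_o.txt"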