Write file to directory based on variable in Python - python

The script will generate multiple files using the year and id variable. These files need to be placed into a folder matching year and id. How do I write them to the correct folders?
file_root_name = row["file_root_name"]
year = row["year"]
id = row["id"]
path = year+'-'+id
try:
os.makedirs(path)
except:
pass
output = open(row['file_root_name']+'.smil', 'w')
output.write(prettify(doctype, root))

If I understand your question correctly, you want to do this:
import os.path
file_name = row['file_root_name']+'.smil'
full_path = os.path.join(path, file_name)
output = open(full_path, 'w')
Please note that it's not very common in Python to use the + operator for string concatenation. Although not in your case, with large strings the method is not very fast.
I'd prefer:
file_name = '%s.smil' % row['file_root_name']
and:
path = '%i-%i' % (year, id)

Related

Create folders based on filenames

I have a folder with some 1500 excel files . The format of each file is something like this:
0d20170101abcd.xlsx
1d20170101ef.xlsx
0d20170104g.xlsx
0d20170109hijkl.xlsx
1d20170109mno.xlsx
0d20170110pqr.xlsx
The first character of the file name is either '0' or '1' followed by 'd' followed by the date when the file was created followed by customer id(abcd,ef,g,hijkl,mno,pqr).The customer id has no fixed length and it can vary.
I want to create folders for each unique date(folder name should be date) and move the files with the same date into a single folder .
So for the above example , 4 folders (20170101,20170104,20170109,20170110) has to be created with files with same dates copied into their respective folders.
I want to know if there is any way to do this in python ? Sorry for not posting any sample code because I have no idea as to how to start.
Try this out:
import os
import re
root_path = 'test'
def main():
# Keep track of directories already created
created_dirs = []
# Go through all stuff in the directory
file_names = os.listdir(root_path)
for file_name in file_names:
process_file(file_name, created_dirs)
def process_file(file_name, created_dirs):
file_path = os.path.join(root_path, file_name)
# Check if it's not itself a directory - safe guard
if os.path.isfile(file_path):
file_date, user_id, file_ext = get_file_info(file_name)
# Check we could parse the infos of the file
if file_date is not None \
and user_id is not None \
and file_ext is not None:
# Make sure we haven't already created the directory
if file_date not in created_dirs:
create_dir(file_date)
created_dirs.append(file_date)
# Move the file and rename it
os.rename(
file_path,
os.path.join(root_path, file_date, '{}.{}'.format(user_id, file_ext)))
print file_date, user_id
def create_dir(dir_name):
dir_path = os.path.join(root_path, dir_name)
if not os.path.exists(dir_path) or not os.path.isdir(dir_path):
os.mkdir(dir_path)
def get_file_info(file_name):
match = re.search(r'[01]d(\d{8})([\w+-]+)\.(\w+)', file_name)
if match:
return match.group(1), match.group(2), match.group(3)
return None, None, None
if __name__ == '__main__':
main()
Note that depending on the names of your files, you might want to change (in the future) the regex I use, i.e. [01]d(\d{8})([\w+-]+) (you can play with it and see details about how to read it here)...
Check this code.
import os
files = list(x for x in os.listdir('.') if x.is_file())
for i in files:
d = i[2:10] #get data from filename
n = i[10:] #get new filename
if os.path.isdir(i[2:10]):
os.rename(os.getcwd()+i,os.getcwd()+d+"/"+i)
else:
os.mkdir(os.getcwd()+i)
os.rename(os.getcwd()+i,os.getcwd()+d+"/"+i)
Here's is the repl link
Try this out :
import os, shutil
filepath = "your_file_path"
files = list(x for x in os.listdir(filepath) if x.endswith(".xlsx"))
dates = list(set(x[2:10] for x in files))
for j in dates:
os.makedirs(filepath + j)
for i in files:
cid = i[10:]
for j in dates:
if j in i:
os.rename(filepath+i,cid)
shutil.copy2(filepath+cid, filepath+j)

How to read files from two folders and avoid duplicates in Python

I have the following folders that I read SQL files from and save them as variables:
++folder1
-1.sql
-2.sql
-3.sql
++folder2
-2.sql
The following code does the job well for a single folder. How I can modify this code to read not just from one folder but from two with a rule that if a file exists in folder2 than don't read the file with the same name from folder1?
folder1 = '../folder1/'
for filename in os.listdir(folder1):
path = os.path.join(folder1, filename)
if os.path.isdir(path):
continue
with open(folder1 + filename, 'r') as myfile:
data = myfile.read()
query_name = filename.replace(".sql", "")
exec (query_name + " = data")
You can try something like follows:
folders = ['../folder2/','../folder1/']
checked =[]
for folder in folders:
for filename in os.listdir(folder):
if filename not in checked:
checked.append(filename)
path = os.path.join(folder, filename)
if os.path.isdir(path):
continue
with open(folder + filename, 'r') as myfile:
data = myfile.read()
query_name = filename.replace(".sql", "")
exec (query_name + " = data")
The answer to this is simple: Do two listdir calls, then skip over the files in folder1 that are also in folder2.
One way to do this is with set operations: the set difference a - b means all elements in a that are not also in b, which is exactly what you want.
files1 = set(os.listdir(folder1))
files2 = set(os.listdir(folder2))
files1 -= files2
paths1 = [os.path.join(folder1, file) for file in files1]
paths2 = [os.path.join(folder2, file) for file in files2]
for path in paths1 + paths2:
if os.path.isdir(path):
# etc.
As a side note, dynamically creating a bunch of variables like this is almost always a very bad idea, and doing it with exec instead of globals or setattr is an even worse idea. It's usually be much better to store everything in, e.g., a dict. For example:
queries = {}
for path in paths1 + paths2:
if os.path.isdir(path):
continue
name = os.path.splitext(os.path.basename(path))[0]
with open(path) as f:
queries[name] = f.read()

Exporting multiple files with different filenames

Lets say I have n files in a directory with filenames: file_1.txt, file_2.txt, file_3.txt .....file_n.txt. I would like to import them into Python individually and then do some computation on them, and then store the results into n corresponding output files: file_1_o.txt, file_2_o.txt, ....file_n_o.txt.
I've figured out how to import multiple files:
import glob
import numpy as np
path = r'home\...\CurrentDirectory'
allFiles = glob.glob(path + '/*.txt')
for file in allFiles:
# do something to file
...
...
np.savetxt(file, ) ???
Not quite sure how to append the _o.txt (or any string for that matter) after the filename so that the output file is file_1_o.txt
Can you use the following snippet to build the output filename?
parts = in_filename.split(".")
out_filename = parts[0] + "_o." + parts[1]
where I assumed in_filename is of the form "file_1.txt".
Of course would probably be better to put "_o." (the suffix before the extension) in a variable so that you can change at will just in one place and have the possibility to change that suffix more easily.
In your case it means
import glob
import numpy as np
path = r'home\...\CurrentDirectory'
allFiles = glob.glob(path + '/*.txt')
for file in allFiles:
# do something to file
...
parts = file.split(".")
out_filename = parts[0] + "_o." + parts[1]
np.savetxt(out_filename, ) ???
but you need to be careful, since maybe before you pass out_filename to np.savetxt you need to build the full path so you might need to have something like
np.savetxt(os.path.join(path, out_filename), )
or something along those lines.
If you would like to combine the change in basically one line and define your "suffix in a variable" as I mentioned before you could have something like
hh = "_o." # variable suffix
..........
# inside your loop now
for file in allFiles:
out_filename = hh.join(file.split("."))
which uses another way of doing the same thing by using join on the splitted list, as mentioned by #NathanAck in his answer.
import os
#put the path to the files here
filePath = "C:/stack/codes/"
theFiles = os.listdir(filePath)
for file in theFiles:
#add path name before the file
file = filePath + str(file)
fileToRead = open(file, 'r')
fileData = fileToRead.read()
#DO WORK ON SPECIFIC FILE HERE
#access the file through the fileData variable
fileData = fileData + "\nAdd text or do some other operations"
#change the file name to add _o
fileVar = file.split(".")
newFileName = "_o.".join(fileVar)
#write the file with _o added from the modified data in fileVar
fileToWrite = open(newFileName, 'w')
fileToWrite.write(fileData)
#close open files
fileToWrite.close()
fileToRead.close()

Python - Loop through list within regex

Right, i'm relatively new to Python, which you will likely see in my code, but is there any way to iterate through a list within regex?
Basically, i'm looping through each filename within a folder, getting a code (2-6 digits) from the filename, and i'm wanting to compare it with a list of codes in a text file, which have a name attached, in the format "1234_Name" (without the quotation marks). If the code exists in both lists, I want to print out the list entry, i.e. 1234_Name. Currently my code only seems to look at the first entry in the text file's list and i'm not sure how to make it look through them all to find matches.
import os, re
sitesfile = open('C:/Users/me/My Documents/WORK_PYTHON/Renaming/testnames.txt', 'r')
filefolder = r'C:/Users/me/My Documents/WORK_PYTHON/Renaming/files/'
sites = sitesfile.read()
site_split = re.split('\n', sites)
old = []
newname = []
for site in site_split:
newname.append(site)
for root, dirs, filenames in os.walk(filefolder):
for filename in filenames:
fullpath = os.path.join(root, filename)
filename_split = os.path.splitext(fullpath)
filename_zero, fileext = filename_split
filename_zs = re.split("/", filename_zero)
filenm = re.search(r"[\w]+", str(filename_zs[-1:]))#get only filename, not path
filenmgrp = filenm.group()
pacode = re.search('\d\d+', filenmgrp)
if pacode:
pacodegrp = pacode.group()
match = re.match(pacodegrp, site)
if match:
print site
Hope this makes sense - thanks a lot in advance!
So, use this code instead:
import os
import re
def locate(pattern = r'\d+[_]', root=os.curdir):
for path, dirs, files in os.walk(os.path.abspath(root)):
for filename in re.findall(pattern, ' '.join(files)):
yield os.path.join(path, filename)
..this will only return files in a folder that match a given regex pattern.
with open('list_file.txt', 'r') as f:
lines = [x.split('_')[0] for x in f.readlines()]
print_out = []
for f in locate(<your code regex>, <your directory>):
if f in lines: print_out.append(f)
print(print_out)
...find the valid codes in your list_file first, then compare the files that come back with your given regex.

Naming multiple files in python and scrapy

I'm trying to save files to a directory after scraping them from the web using scrapy. I'm extracting a date from the file and using that as the file name. The problem I'm running into, however, is that some files have the same date, i.e. there are two files that would take the name "June 2, 2009". So, what I'm looking to do is somehow check whether there is already a file with the same name, and if so, name it something like "June 2, 2009.1" or some such.
The code I'm using is as follows:
def parse_item(self, response):
self.log('Hi, this is an item page! %s' % response.url)
response = response.replace(body=response.body.replace('<br />', '\n'))
hxs = HtmlXPathSelector(response)
date = hxs.select("//div[#id='content']").extract()[0]
dateStrip = re.search(r"([A-Z]*|[A-z][a-z]+)\s\d*\d,\s[0-9]+", date)
newDate = dateStrip.group()
content = hxs.select("//div[#id='content']")
content = content.select('string()').extract()[0]
filename = ("/path/to/a/folder/ %s.txt") % (newDate)
with codecs.open(filename, 'w', encoding='utf-8') as output:
output.write(content)
You can use os.listdir to get a list of existing files and allocate a filename that will not cause conflict.
import os
def get_file_store_name(path, fname):
count = 0
for f in os.listdir(path):
if fname in f:
count += 1
return os.path.join(path, fname+str(count))
# This is example to use
print get_file_store_name(".", "README")+".txt"
The usual way to check for existence of a file in the C library is with a function called stat(). Python offers a thin wrapper around this function in the form of os.stat(). I suggest you use that.
http://docs.python.org/library/stat.html
def file_exists(fname):
try:
stat_info = os.stat(fname)
if os.S_ISREG(stat_info): # true for regular file
return True
except Exception:
pass
return False
one other solution is you can append time with date, for naming file like
from datetime import datetime
filename = ("/path/to/a/folder/ %s_%s.txt") % (newDate,datetime.now().strftime("%H%M%S"))
The other answer pointed me in the correct direction by checking into the os tools in python, but I think the way I found is perhaps more straightforward. Reference here How do I check whether a file exists using Python? for more.
The following is the code I came up with:
existence = os.path.isfile(filename)
if existence == False:
with codecs.open(filename, 'w', encoding='utf-8') as output:
output.write(content)
else:
newFilename = ("/path/.../.../- " + '%s' ".1.txt") % (newDate)
with codecs.open(newFilename, 'w', encoding='utf-8') as output:
output.write(content)
Edited to Add:
I didn't like this solution too much, and thought the other answer's solution was probably better but didn't quite work. The main part I didn't like about my solution was that it only worked with 2 files of the same name; if three or four files had the same name the initial problem would occur. The following is what I came up with:
filename = ("/Users/path/" + " " + "title " + '%s' + " " + "-1.txt") % (date)
filename = str(filename)
while True:
os.path.isfile(filename)
newName = filename.replace(".txt", "", filename)
newName = str.split(newName)
newName[-1] = str(int(newName[-1]) + 1)
filename = " ".join(newName) + ".txt"
if os.path.isfile(filename) == False:
with codecs.open(filename, 'w', encoding='utf-8') as output:
output.write(texts)
break
It probably isn't the most elegant and might be kind of a hackish approach, but it has worked so far and seems to have addressed my problem.

Categories